Changeset 745 for trunk/server/lib/tdb

trunk/server

Property svn:mergeinfo changed
/vendor/current merged: 581,587,591,594,597,600,615,618,740

trunk/server/lib/tdb/common/check.c

-              r414
+              r745
+{
         struct tdb_header hdr;
+        if (tdb->methods->tdb_read(tdb, 0, &hdr, sizeof(hdr), DOCONV()) == -1)
+        uint32_t h1, h2;
+        if (tdb->methods->tdb_read(tdb, 0, &hdr, sizeof(hdr), 0) == -1)
                 return false;
         if (strcmp(hdr.magic_food, TDB_MAGIC_FOOD) != 0)
 …
                 goto corrupt;
+        if (hdr.rwlocks != 0)
+        if (hdr.rwlocks != 0 && hdr.rwlocks != TDB_HASH_RWLOCK_MAGIC)
+                goto corrupt;
+        tdb_header_hash(tdb, &h1, &h2);
+        if (hdr.magic1_hash && hdr.magic2_hash &&
+            (hdr.magic1_hash != h1 || hdr.magic2_hash != h2))
                 goto corrupt;
 …
+}
+int tdb_check(struct tdb_context *tdb,
+/* Slow, but should be very rare.
+ *
+ * Loops with len starting at 0 while off + len < tdb->map_size, reading
+ * one byte per iteration through the tdb_read method.  The loop stops at
+ * the first byte that is neither 0 nor 0x42 and the function returns len.
+ * Returns 0 if tdb_read returns nonzero.
+ *
+ * NOTE(review): the read below always fetches the byte at off, not at
+ * off + len, so every iteration inspects the same byte -- confirm whether
+ * off + len was intended.
+ */
+size_t tdb_dead_space(struct tdb_context *tdb, tdb_off_t off)
+{
+        size_t len;
+        for (len = 0; off + len < tdb->map_size; len++) {
+                char c;
+                if (tdb->methods->tdb_read(tdb, off, &c, 1, 0))
+                        return 0;
+                if (c != 0 && c != 0x42)
+                        break;
+        }
+        return len;
+}
+_PUBLIC_ int tdb_check(struct tdb_context *tdb,
               int (*check)(TDB_DATA key, TDB_DATA data, void *private_data),
               void *private_data)
 …
         struct tdb_record rec;
         bool found_recovery = false;
+        if (tdb_lockall(tdb) == -1)
+                return -1;
+        tdb_len_t dead;
+        bool locked;
+        /* Read-only databases use no locking at all: it's best-effort.
+         * We may have a write lock already, so skip that case too. */
+        if (tdb->read_only || tdb->allrecord_lock.count != 0) {
+                locked = false;
+        } else {
+                if (tdb_lockall_read(tdb) == -1)
+                        return -1;
+                locked = true;
+        }
         /* Make sure we know true size of the underlying file. */
 …
                                 goto free;
                         break;
+                /* If we crash after ftruncate, we can get zeroes or fill. */
+                case TDB_RECOVERY_INVALID_MAGIC:
+                case 0x42424242:
+                        if (recovery_start == off) {
+                                found_recovery = true;
+                                break;
+                        }
+                        dead = tdb_dead_space(tdb, off);
+                        if (dead < sizeof(rec))
+                                goto corrupt;
+                        TDB_LOG((tdb, TDB_DEBUG_ERROR,
+                                 "Dead space at %d-%d (of %u)\n",
+                                 off, off + dead, tdb->map_size));
+                        rec.rec_len = dead - sizeof(rec);
+                        break;
                 case TDB_RECOVERY_MAGIC:
-                case 0: /* Used for invalid (or in-progress) recovery area. */
                         if (recovery_start != off) {
                                 TDB_LOG((tdb, TDB_DEBUG_ERROR,
 …
                         found_recovery = true;
                         break;
+                default:
+                default: ;
+                corrupt:
                         tdb->ecode = TDB_ERR_CORRUPT;
                         TDB_LOG((tdb, TDB_DEBUG_ERROR,
 …
         if (recovery_start != 0 && !found_recovery) {
                 TDB_LOG((tdb, TDB_DEBUG_ERROR,
+                         "Expected %s recovery area, got %s\n",
+                         recovery_start ? "a" : "no",
+                         found_recovery ? "one" : "none"));
+                         "Expected a recovery area at %u\n",
+                         recovery_start));
                 goto free;
+        }
         free(hashes);
+        tdb_unlockall(tdb);
+        if (locked) {
+                tdb_unlockall_read(tdb);
+        }
         return 0;
 …
         free(hashes);
 unlock:
+        tdb_unlockall(tdb);
+        if (locked) {
+                tdb_unlockall_read(tdb);
+        }
         return -1;
+}

trunk/server/lib/tdb/common/dump.c

-              r414
+              r745
    Copyright (C) Paul `Rusty' Russell              2000
    Copyright (C) Jeremy Allison                    2000-2003
      ** NOTE! The following LGPL license applies to the tdb
      ** library. This does NOT imply that all of Samba is released
      ** under the LGPL
    This library is free software; you can redistribute it and/or
    modify it under the terms of the GNU Lesser General Public
 …
+}
 void tdb_dump_all(struct tdb_context *tdb)
+_PUBLIC_ void tdb_dump_all(struct tdb_context *tdb)
+{
         int i;
 …
+}
 int tdb_printfreelist(struct tdb_context *tdb)
+_PUBLIC_ int tdb_printfreelist(struct tdb_context *tdb)
+{
         int ret;

trunk/server/lib/tdb/common/error.c

-              r414
+              r745
    Copyright (C) Paul `Rusty' Russell              2000
    Copyright (C) Jeremy Allison                    2000-2003
      ** NOTE! The following LGPL license applies to the tdb
      ** library. This does NOT imply that all of Samba is released
      ** under the LGPL
    This library is free software; you can redistribute it and/or
    modify it under the terms of the GNU Lesser General Public
 …
 #include "tdb_private.h"
 enum TDB_ERROR tdb_error(struct tdb_context *tdb)
+_PUBLIC_ enum TDB_ERROR tdb_error(struct tdb_context *tdb)
+{
         return tdb->ecode;
 …
 /* Error string for the last tdb error */
 const char *tdb_errorstr(struct tdb_context *tdb)
+_PUBLIC_ const char *tdb_errorstr(struct tdb_context *tdb)
+{
         uint32_t i;

trunk/server/lib/tdb/common/freelist.c

-              r414
+              r745
    Copyright (C) Paul `Rusty' Russell              2000
    Copyright (C) Jeremy Allison                    2000-2003
      ** NOTE! The following LGPL license applies to the tdb
      ** library. This does NOT imply that all of Samba is released
      ** under the LGPL
    This library is free software; you can redistribute it and/or
    modify it under the terms of the GNU Lesser General Public
 …
 /* Add an element into the freelist. Merge adjacent records if
    neccessary. */
+   necessary. */
 int tdb_free(struct tdb_context *tdb, tdb_off_t offset, struct tdb_record *rec)
+{
 …
                 struct tdb_record l;
                 tdb_off_t leftsize;
                 /* Read in tailer and jump back to header */
                 if (tdb_ofs_read(tdb, left, &leftsize) == -1) {
 …
                         break;
+                }
                 /* this multiplier means we only extremely rarely
                    search more than 50 or so records. At 50 records we
 …
    return the size of the freelist - used to decide if we should repack
 */
 int tdb_freelist_size(struct tdb_context *tdb)
+_PUBLIC_ int tdb_freelist_size(struct tdb_context *tdb)
+{
         tdb_off_t ptr;

trunk/server/lib/tdb/common/freelistcheck.c

r414	r745
44	44	}
45	45
46		int tdb_validate_freelist(struct tdb_context tdb, int pnum_entries)
	46	_PUBLIC_ int tdb_validate_freelist(struct tdb_context tdb, int pnum_entries)
47	47	{
48	48	struct tdb_context *mem_tdb = NULL;

trunk/server/lib/tdb/common/io.c

-              r599
+              r745
    Copyright (C) Paul `Rusty' Russell              2000
    Copyright (C) Jeremy Allison                    2000-2003
      ** NOTE! The following LGPL license applies to the tdb
      ** library. This does NOT imply that all of Samba is released
      ** under the LGPL
    This library is free software; you can redistribute it and/or
    modify it under the terms of the GNU Lesser General Public
 …
         // the owner (us) is not allowed to write to the file (different from unix)
         TDB_LOG((tdb, TDB_DEBUG_TRACE,"unlocking at %d len=%d before writing.\n", off, len));
         tdb_brlock( tdb, off, F_UNLCK, F_SETLK, 0, 1);
+        tdb_brunlock( tdb, F_RDLCK, off, len);
         // if a wider previous lock is in effect, we cannot write lock our segment
         // (e.g. a lock_upgrade locks all the file), so we hope the previous lock
         // is a write lock: do not wait for lock.
         tdb_brlock( tdb, off, F_WRLCK, F_SETLK, 0, len);
+        tdb_brlock( tdb, F_WRLCK, off, len, F_SETLK);
 #endif
 …
                                  "write %d bytes at %d in two attempts\n",
                                  len, off));
 #ifdef __OS2__ // remove our lock
                         tdb_brlock( tdb, off, F_UNLCK, F_SETLK, 0, len);
+                        tdb_brunlock( tdb, F_WRLCK, off, len);
 #endif
                         return -1;
+                }
+        }
 #ifdef __OS2__ // remove our lock
         tdb_brlock( tdb, off, F_UNLCK, F_SETLK, 0, len);
+                        tdb_brunlock( tdb, F_WRLCK, off, len);
 #endif
         return 0;
+}
 …
         tdb_oob,
         tdb_expand_file,
-        tdb_brlock
 };

trunk/server/lib/tdb/common/lock.c

-              r647
+              r745
    Copyright (C) Paul `Rusty' Russell              2000
    Copyright (C) Jeremy Allison                    2000-2003
      ** NOTE! The following LGPL license applies to the tdb
      ** library. This does NOT imply that all of Samba is released
      ** under the LGPL
    This library is free software; you can redistribute it and/or
    modify it under the terms of the GNU Lesser General Public
 …
 #include "tdb_private.h"
-#define TDB_MARK_LOCK 0x80000000
 #ifdef __OS2__
+static char* lock_type( int lck)
+{
+        static char buffer[16];
+        switch(lck) {
+        case F_GETLK: return "F_GETLK";
+        case F_SETLK: return "F_SETLK";
+        case F_SETLKW: return "F_SETLKW";
+        default:
+                sprintf( buffer, "unknown %d", lck);
+        }
+        return buffer;
+static char* lock_type( bool waitflag)
+{       /* Return the name of the fcntl command that corresponds to
+         * waitflag: "F_SETLKW" when waitflag is true, "F_SETLK" otherwise.
+         * The returned pointer refers to a string literal. */
+        if (waitflag)
+                return "F_SETLKW";
+        else
+                return "F_SETLK";
+}
 …
 static int _mutex_brlock(struct tdb_context *tdb, tdb_off_t offset,
                int rw_type, int lck_type, int probe, size_t len)
+               int rw_type, bool waitflag, size_t len)
+{
         HMTX    hSem;
 …
         switch( offset) {
         case GLOBAL_LOCK:
+        case OPEN_LOCK:
                 hSem = tdb->hGlobalLock;
                 break;
 …
         TDB_LOG((tdb, TDB_DEBUG_TRACE,"_mutex_brlock handle %d, offset %d\n", hSem, offset));
         if (lck_type == F_SETLKW)
+        if (waitflag)
                 ulTimeout = SEM_INDEFINITE_WAIT;
         else
 …
         errno = EINVAL;
         TDB_LOG(( tdb, TDB_DEBUG_ERROR, "_mutex_brlock pid %X, failed (fd=%d) at offset %d rw_type=%d lck_type=%d len=%d, rc=%d\n",
                  getpid(), tdb->fd, offset, rw_type, lck_type, (int)len, rc));
+                 getpid(), tdb->fd, offset, rw_type, waitflag, (int)len, rc));
         tdb->ecode = TDB_ERR_LOCK;
         return -1;
 …
 #endif
 void tdb_setalarm_sigptr(struct tdb_context *tdb, volatile sig_atomic_t *ptr)
+_PUBLIC_ void tdb_setalarm_sigptr(struct tdb_context *tdb, volatile sig_atomic_t *ptr)
+{
         tdb->interrupt_sig_ptr = ptr;
+}
+/* a byte range locking function - return 0 on success
+   this functions locks/unlocks 1 byte at the specified offset.
+   On error, errno is also set so that errors are passed back properly
+   through tdb_open().
+   note that a len of zero means lock to end of file
+*/
+int tdb_brlock(struct tdb_context *tdb, tdb_off_t offset,
+               int rw_type, int lck_type, int probe, size_t len)
+{
+static int fcntl_lock(struct tdb_context *tdb,
+                      int rw, tdb_off_t off, size_t len, bool waitflag)
+{
 #ifdef __OS2__
         APIRET      rc;
         ULONG       fAccess = 0;
         int         fLock = 0;
+        ULONG       fAccess = 0; // default exclusiv
+        int         fLock = 1;   // default lock
         off_t       cbFile;
         off_t       offStart;
         off_t       cbRange;
+        TDB_LOG((tdb, TDB_DEBUG_TRACE, "tdb_brlock pid %X, fd %d, lck_type %s, rw_type %s, offset %d, len %d\n",
+                getpid(), tdb->fd, lock_type(lck_type), read_type(rw_type), offset, len));
+        if (tdb->flags & TDB_NOLOCK) {
+                return 0;
+        }
+        if ((rw_type == F_WRLCK) && (tdb->read_only || tdb->traverse_read)) {
+                tdb->ecode = TDB_ERR_RDONLY;
+                return -1;
+        }
+        FILELOCK    lockArea = {0}, unlockArea = {0};
+        TDB_LOG((tdb, TDB_DEBUG_TRACE, "fcntl_lock in pid %X, fd %d, lck_type %s, rw_type %s, offset %d, len %d\n",
+                getpid(), tdb->fd, lock_type(waitflag), read_type(rw), off, len));
         switch( offset) {
         case GLOBAL_LOCK:
+        switch(off) {
+        case OPEN_LOCK:
         case ACTIVE_LOCK:
         case TRANSACTION_LOCK:
                 return _mutex_brlock( tdb, offset, rw_type,  lck_type,  probe, len);
+                return _mutex_brlock(tdb, off, rw, waitflag, len);
+        }
         /* flags and order */
+        fAccess = 0; /* exclusive */
+        switch (rw_type)
+        switch (rw)
+        {
                 case F_UNLCK:
                         fLock = 0;
+                        unlockArea.lOffset = off;
+                        unlockArea.lRange  = len ? len : tdb->header.hash_size *4; //was LONG_MAX
                         break;
                 case F_RDLCK:
+                        lockArea.lOffset = off;
+                        lockArea.lRange  = len ? len : LONG_MAX;
                         fAccess = 1; /* read-only */
+                        break;
                 case F_WRLCK:
+                        fLock = 1;
+                        lockArea.lOffset = off;
+                        lockArea.lRange  = len ? len : LONG_MAX;
                         break;
                 default:
 …
+        }
+        FILELOCK   aflock[2];
+        bzero(&aflock[(fLock + 1) & 1], sizeof(aflock[0]));
+        aflock[fLock].lOffset = offset;
+        aflock[fLock].lRange  = len ? len : LONG_MAX;
+        rc = DosSetFileLocks(tdb->fd, &aflock[0], &aflock[1], SEM_IMMEDIATE_RETURN, fAccess);
+        if (rc != NO_ERROR && lck_type == F_SETLKW) {
+        rc = DosSetFileLocks(tdb->fd, &unlockArea, &lockArea, SEM_IMMEDIATE_RETURN, fAccess);
+        if (rc != NO_ERROR && waitflag) {
                 int     count = 20;
                 do {
                         rc = DosSetFileLocks(tdb->fd, &aflock[0], &aflock[1], 100, fAccess);
+                        rc = DosSetFileLocks(tdb->fd, &unlockArea, &lockArea, 100, fAccess);
                         count--;
                 } while( count>0 && rc !=NO_ERROR);
+        }
+        TDB_LOG(( tdb, TDB_DEBUG_TRACE, "fcntl_lock out pid %X, fd %d, lck_type %s, rw_type %s, offset %d, len %d, rc=%d\n",
+                getpid(), tdb->fd, lock_type(waitflag), read_type(rw), off, len, rc));
         if (rc != NO_ERROR) {
                 errno  = EINVAL;
+                /* Generic lock error. errno set by fcntl.
+                 * EAGAIN is an expected return from non-blocking
+                 * locks. */
+                if (!probe && lck_type != F_SETLK) {
+                        /* Ensure error code is set for log fun to examine. */
+                        tdb->ecode = TDB_ERR_LOCK;
+                        TDB_LOG((tdb, TDB_DEBUG_TRACE,"tdb_brlock failed (fd=%d) at offset %d rw_type=%d lck_type=%d len=%d\n",
+                                 tdb->fd, offset, rw_type, lck_type, (int)len));
+                return -1;
+        }
+#else
+        struct flock fl;
+        fl.l_type = rw;
+        fl.l_whence = SEEK_SET;
+        fl.l_start = off;
+        fl.l_len = len;
+        fl.l_pid = 0;
+        if (waitflag)
+                return fcntl(tdb->fd, F_SETLKW, &fl);
+        else
+                return fcntl(tdb->fd, F_SETLK, &fl);
+#endif
+}
+static int fcntl_unlock(struct tdb_context *tdb, int rw, tdb_off_t off, size_t len)
+{
+        struct flock fl;
+#if 0 /* Check they matched up locks and unlocks correctly. */
+        char line[80];
+        FILE *locks;
+        bool found = false;
+        locks = fopen("/proc/locks", "r");
+        while (fgets(line, 80, locks)) {
+                char *p;
+                int type, start, l;
+                /* eg. 1: FLOCK  ADVISORY  WRITE 2440 08:01:2180826 0 EOF */
+                p = strchr(line, ':') + 1;
+                if (strncmp(p, " POSIX  ADVISORY  ", strlen(" POSIX  ADVISORY  ")))
+                        continue;
+                p += strlen(" FLOCK  ADVISORY  ");
+                if (strncmp(p, "READ  ", strlen("READ  ")) == 0)
+                        type = F_RDLCK;
+                else if (strncmp(p, "WRITE ", strlen("WRITE ")) == 0)
+                        type = F_WRLCK;
+                else
+                        abort();
+                p += 6;
+                if (atoi(p) != getpid())
+                        continue;
+                p = strchr(strchr(p, ' ') + 1, ' ') + 1;
+                start = atoi(p);
+                p = strchr(p, ' ') + 1;
+                if (strncmp(p, "EOF", 3) == 0)
+                        l = 0;
+                else
+                        l = atoi(p) - start + 1;
+                if (off == start) {
+                        if (len != l) {
+                                fprintf(stderr, "Len %u should be %u: %s",
+                                        (int)len, l, line);
+                                abort();
+                        }
+                        if (type != rw) {
+                                fprintf(stderr, "Type %s wrong: %s",
+                                        rw == F_RDLCK ? "READ" : "WRITE", line);
+                                abort();
+                        }
+                        found = true;
+                        break;
+                }
+                TDB_LOG(( tdb, TDB_DEBUG_TRACE, "tdb_brlock pid %X, failed (fd=%d) at offset %d rw_type=%d lck_type=%d len=%d\n",
+                         getpid(), tdb->fd, offset, rw_type, lck_type, (int)len));
+                tdb->ecode = TDB_ERR_LOCK;
+                return -1;
+        }
+        TDB_LOG(( tdb, TDB_DEBUG_TRACE, "tdb_brlock pid %X, fd %d, lck_type %s, rw_type %s, offset %d, len %d DONE\n",
+                getpid(), tdb->fd, lock_type(lck_type), read_type(rw_type), offset, len));
+        }
+        if (!found) {
+                fprintf(stderr, "Unlock on %u@%u not found!\n",
+                        (int)off, (int)len);
+                abort();
+        }
+        fclose(locks);
+#endif
+        fl.l_type = F_UNLCK;
+        fl.l_whence = SEEK_SET;
+        fl.l_start = off;
+        fl.l_len = len;
+        fl.l_pid = 0;
+#ifdef __OS2__
+        return fcntl_lock(tdb, F_UNLCK, off, len, 1);
 #else
+        struct flock fl;
+        return fcntl(tdb->fd, F_SETLKW, &fl);
+#endif
+}
+/* list -1 is the alloc list, otherwise a hash chain.
+ * Maps a list index to the file offset used for its lock:
+ * FREELIST_TOP + 4*list, so list -1 lands 4 bytes below FREELIST_TOP. */
+static tdb_off_t lock_offset(int list)
+{
+        return FREELIST_TOP + 4*list;
+}
+/* a byte range locking function - return 0 on success
+   this functions locks/unlocks 1 byte at the specified offset.
+   On error, errno is also set so that errors are passed back properly
+   through tdb_open().
+   note that a len of zero means lock to end of file
+*/
+int tdb_brlock(struct tdb_context *tdb,
+               int rw_type, tdb_off_t offset, size_t len,
+               enum tdb_lock_flags flags)
+{
         int ret;
         if (tdb->flags & TDB_NOLOCK) {
+                return 0;
+        }
+        if (flags & TDB_LOCK_MARK_ONLY) {
                 return 0;
+        }
 …
+        }
-        fl.l_type = rw_type;
-        fl.l_whence = SEEK_SET;
-        fl.l_start = offset;
-        fl.l_len = len;
-        fl.l_pid = 0;
         do {
                 ret = fcntl(tdb->fd,lck_type,&fl);
+                ret = fcntl_lock(tdb, rw_type, offset, len,
+                                 flags & TDB_LOCK_WAIT);
                 /* Check for a sigalarm break. */
                 if (ret == -1 && errno == EINTR &&
 …
                  * EAGAIN is an expected return from non-blocking
                  * locks. */
                 if (!probe && lck_type != F_SETLK) {
                         TDB_LOG((tdb, TDB_DEBUG_TRACE,"tdb_brlock failed (fd=%d) at offset %d rw_type=%d lck_type=%d len=%d\n",
                                  tdb->fd, offset, rw_type, lck_type, (int)len));
+                if (!(flags & TDB_LOCK_PROBE) && errno != EAGAIN) {
+                        TDB_LOG((tdb, TDB_DEBUG_TRACE,"tdb_brlock failed (fd=%d) at offset %d rw_type=%d flags=%d len=%d\n",
+                                 tdb->fd, offset, rw_type, flags, (int)len));
+                }
                 return -1;
+        }
         return 0;
+#endif
+}
+}
+int tdb_brunlock(struct tdb_context *tdb,
+                 int rw_type, tdb_off_t offset, size_t len)
+{       /* Release the byte-range lock described by rw_type/offset/len by
+         * calling fcntl_unlock().  Returns 0 immediately when the
+         * TDB_NOLOCK flag is set; otherwise returns the fcntl_unlock()
+         * result, logging a trace message when that result is -1. */
+        int ret;
+        if (tdb->flags & TDB_NOLOCK) {
+                return 0;
+        }
+        do {    /* retry for as long as the call fails with EINTR */
+                ret = fcntl_unlock(tdb, rw_type, offset, len);
+        } while (ret == -1 && errno == EINTR);
+        if (ret == -1) {
+                TDB_LOG((tdb, TDB_DEBUG_TRACE,"tdb_brunlock failed (fd=%d) at offset %d rw_type=%d len=%d\n",
+                         tdb->fd, offset, rw_type, (int)len));
+        }
+        return ret;
+}
 /*
 …
   made. For those OSes we may loop for a while.
 */
 int tdb_brlock_upgrade(struct tdb_context *tdb, tdb_off_t offset, size_t len)
+int tdb_allrecord_upgrade(struct tdb_context *tdb)
+{
         int count = 1000;
+        if (tdb->allrecord_lock.count != 1) {
+                TDB_LOG((tdb, TDB_DEBUG_ERROR,
+                         "tdb_allrecord_upgrade failed: count %u too high\n",
+                         tdb->allrecord_lock.count));
+                return -1;
+        }
+        if (tdb->allrecord_lock.off != 1) {
+                TDB_LOG((tdb, TDB_DEBUG_ERROR,
+                         "tdb_allrecord_upgrade failed: already upgraded?\n"));
+                return -1;
+        }
         while (count--) {
                 struct timeval tv;
 #ifdef __OS2__
                 // YD we cannot upgrade without an unlock first...
                 tdb_brlock(tdb, offset, F_UNLCK, F_SETLKW, 1, len);
+                tdb_brlock(tdb, F_UNLCK, FREELIST_TOP, 0, TDB_LOCK_WAIT|TDB_LOCK_PROBE);
 #endif
+                if (tdb_brlock(tdb, offset, F_WRLCK, F_SETLKW, 1, len) == 0) {
+                if (tdb_brlock(tdb, F_WRLCK, FREELIST_TOP, 0,
+                               TDB_LOCK_WAIT|TDB_LOCK_PROBE) == 0) {
+                        tdb->allrecord_lock.ltype = F_WRLCK;
+                        tdb->allrecord_lock.off = 0;
                         return 0;
+                }
 …
                 select(0, NULL, NULL, NULL, &tv);
+        }
         TDB_LOG((tdb, TDB_DEBUG_TRACE,"tdb_brlock_upgrade failed at offset %d\n", offset));
+        TDB_LOG((tdb, TDB_DEBUG_TRACE,"tdb_allrecord_upgrade failed\n"));
         return -1;
+}
+/* lock a list in the database. list -1 is the alloc list */
+static int _tdb_lock(struct tdb_context *tdb, int list, int ltype, int op)
+static struct tdb_lock_type *find_nestlock(struct tdb_context *tdb,
+                                           tdb_off_t offset)
+{       /* Linear search of tdb->lockrecs[0 .. num_lockrecs-1] for the
+         * entry whose off field equals offset.  Returns a pointer to that
+         * entry, or NULL when no entry matches. */
+        unsigned int i;
+        for (i=0; i<tdb->num_lockrecs; i++) {
+                if (tdb->lockrecs[i].off == offset) {
+                        return &tdb->lockrecs[i];
+                }
+        }
+        return NULL;
+}
+/* lock an offset in the database. */
+int tdb_nest_lock(struct tdb_context *tdb, uint32_t offset, int ltype,
+                  enum tdb_lock_flags flags)
+{
         struct tdb_lock_type *new_lck;
+        int i;
+        bool mark_lock = ((ltype & TDB_MARK_LOCK) == TDB_MARK_LOCK);
+        ltype &= ~TDB_MARK_LOCK;
+        /* a global lock allows us to avoid per chain locks */
+        if (tdb->global_lock.count &&
+            (ltype == tdb->global_lock.ltype || ltype == F_RDLCK)) {
+                return 0;
+        }
+        if (tdb->global_lock.count) {
+        if (offset >= lock_offset(tdb->header.hash_size)) {
                 tdb->ecode = TDB_ERR_LOCK;
+                return -1;
+        }
+        if (list < -1 || list >= (int)tdb->header.hash_size) {
+                tdb->ecode = TDB_ERR_LOCK;
+                TDB_LOG((tdb, TDB_DEBUG_ERROR,"tdb_lock: invalid list %d for ltype=%d\n",
+                           list, ltype));
+                TDB_LOG((tdb, TDB_DEBUG_ERROR,"tdb_lock: invalid offset %u for ltype=%d\n",
+                         offset, ltype));
                 return -1;
+        }
 …
                 return 0;
+        for (i=0; i<tdb->num_lockrecs; i++) {
+                if (tdb->lockrecs[i].list == list) {
+                        if (tdb->lockrecs[i].count == 0) {
+                                /*
+                                 * Can't happen, see tdb_unlock(). It should
+                                 * be an assert.
+                                 */
+                                TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_lock: "
+                                         "lck->count == 0 for list %d", list));
+                        }
+                        /*
+                         * Just increment the in-memory struct, posix locks
+                         * don't stack.
+                         */
+                        tdb->lockrecs[i].count++;
+                        return 0;
+                }
+        new_lck = find_nestlock(tdb, offset);
+        if (new_lck) {
+                /*
+                 * Just increment the in-memory struct, posix locks
+                 * don't stack.
+                 */
+                new_lck->count++;
+                return 0;
+        }
 …
         /* Since fcntl locks don't nest, we do a lock for the first one,
            and simply bump the count for future ones */
+        if (!mark_lock &&
+            tdb->methods->tdb_brlock(tdb,FREELIST_TOP+4*list, ltype, op,
+, 1)) {
+                return -1;
+        }
+        tdb->num_locks++;
+        tdb->lockrecs[tdb->num_lockrecs].list = list;
+        if (tdb_brlock(tdb, ltype, offset, 1, flags)) {
+                return -1;
+        }
+        tdb->lockrecs[tdb->num_lockrecs].off = offset;
         tdb->lockrecs[tdb->num_lockrecs].count = 1;
         tdb->lockrecs[tdb->num_lockrecs].ltype = ltype;
         tdb->num_lockrecs += 1;
+        tdb->num_lockrecs++;
         return 0;
+}
+static int tdb_lock_and_recover(struct tdb_context *tdb)
+{       /* Take write locks on FREELIST_TOP (len 0) and then on OPEN_LOCK
+         * (1 byte), run tdb_transaction_recover(), and release both locks
+         * in the reverse order.  Returns -1 if either lock cannot be
+         * taken, otherwise the result of tdb_transaction_recover(). */
+        int ret;
+        /* We need to match locking order in transaction commit. */
+        if (tdb_brlock(tdb, F_WRLCK, FREELIST_TOP, 0, TDB_LOCK_WAIT)) {
+                return -1;
+        }
+        if (tdb_brlock(tdb, F_WRLCK, OPEN_LOCK, 1, TDB_LOCK_WAIT)) {
+                tdb_brunlock(tdb, F_WRLCK, FREELIST_TOP, 0); /* undo the first lock */
+                return -1;
+        }
+        ret = tdb_transaction_recover(tdb);
+        tdb_brunlock(tdb, F_WRLCK, OPEN_LOCK, 1);
+        tdb_brunlock(tdb, F_WRLCK, FREELIST_TOP, 0);
+        return ret;
+}
+static bool have_data_locks(const struct tdb_context *tdb)
+{       /* Returns true if any entry in tdb->lockrecs has an offset at or
+         * above lock_offset(-1), the alloc-list lock offset; false when
+         * no such entry exists. */
+        unsigned int i;
+        for (i = 0; i < tdb->num_lockrecs; i++) {
+                if (tdb->lockrecs[i].off >= lock_offset(-1))
+                        return true;
+        }
+        return false;
+}
+static int tdb_lock_list(struct tdb_context *tdb, int list, int ltype,
+                         enum tdb_lock_flags waitflag)
+{       /* Lock the given list (lock_offset(list)) with type ltype.
+         * Returns 0 on success and -1 on failure.  When this is the first
+         * data lock and tdb_needs_recovery() reports true, the new lock is
+         * dropped, recovery is run via tdb_lock_and_recover(), and the
+         * lock attempt is repeated by calling this function again. */
+        int ret;
+        bool check = false;
+        /* an allrecord lock allows us to avoid per chain locks */
+        if (tdb->allrecord_lock.count &&
+            (ltype == tdb->allrecord_lock.ltype || ltype == F_RDLCK)) {
+                return 0;
+        }
+        if (tdb->allrecord_lock.count) { /* allrecord lock held, but of an incompatible type */
+                tdb->ecode = TDB_ERR_LOCK;
+                ret = -1;
+        } else {
+                /* Only check when we grab first data lock. */
+                check = !have_data_locks(tdb);
+                ret = tdb_nest_lock(tdb, lock_offset(list), ltype, waitflag);
+                if (ret == 0 && check && tdb_needs_recovery(tdb)) {
+                        tdb_nest_unlock(tdb, lock_offset(list), ltype, false);
+                        if (tdb_lock_and_recover(tdb) == -1) {
+                                return -1;
+                        }
+                        return tdb_lock_list(tdb, list, ltype, waitflag); /* retry after recovery */
+                }
+        }
+        return ret;
+}
 …
+{
         int ret;
+        ret = _tdb_lock(tdb, list, ltype, F_SETLKW);
+        ret = tdb_lock_list(tdb, list, ltype, TDB_LOCK_WAIT);
         if (ret) {
                 TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_lock failed on list %d "
 …
 int tdb_lock_nonblock(struct tdb_context *tdb, int list, int ltype)
+{
+        return _tdb_lock(tdb, list, ltype, F_SETLK);
+}
+/* unlock the database: returns void because it's too late for errors. */
+        /* changed to return int it may be interesting to know there
+           has been an error  --simo */
+int tdb_unlock(struct tdb_context *tdb, int list, int ltype)
+        return tdb_lock_list(tdb, list, ltype, TDB_LOCK_NOWAIT);
+}
+int tdb_nest_unlock(struct tdb_context *tdb, uint32_t offset, int ltype,
+                    bool mark_lock)
+{
         int ret = -1;
+        int i;
+        struct tdb_lock_type *lck = NULL;
+        bool mark_lock = ((ltype & TDB_MARK_LOCK) == TDB_MARK_LOCK);
+        ltype &= ~TDB_MARK_LOCK;
+        /* a global lock allows us to avoid per chain locks */
+        if (tdb->global_lock.count &&
+            (ltype == tdb->global_lock.ltype || ltype == F_RDLCK)) {
+                return 0;
+        }
+        if (tdb->global_lock.count) {
+                tdb->ecode = TDB_ERR_LOCK;
+                return -1;
+        }
+        struct tdb_lock_type *lck;
         if (tdb->flags & TDB_NOLOCK)
 …
         /* Sanity checks */
         if (list < -1 || list >= (int)tdb->header.hash_size) {
                 TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_unlock: list %d invalid (%d)\n", list, tdb->header.hash_size));
+        if (offset >= lock_offset(tdb->header.hash_size)) {
+                TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_unlock: offset %u invalid (%d)\n", offset, tdb->header.hash_size));
                 return ret;
+        }
+        for (i=0; i<tdb->num_lockrecs; i++) {
+                if (tdb->lockrecs[i].list == list) {
+                        lck = &tdb->lockrecs[i];
+                        break;
+                }
+        }
+        lck = find_nestlock(tdb, offset);
         if ((lck == NULL) || (lck->count == 0)) {
                 TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_unlock: count is 0\n"));
 …
                 ret = 0;
         } else {
+                ret = tdb->methods->tdb_brlock(tdb, FREELIST_TOP+4*list, F_UNLCK,
+                                               F_SETLKW, 0, 1);
+        }
+        tdb->num_locks--;
+                ret = tdb_brunlock(tdb, ltype, offset, 1);
+        }
         /*
 …
          * last array element.
          */
+        if (tdb->num_lockrecs > 1) {
+                *lck = tdb->lockrecs[tdb->num_lockrecs-1];
+        }
+        tdb->num_lockrecs -= 1;
+        *lck = tdb->lockrecs[--tdb->num_lockrecs];
         /*
 …
+}
+int tdb_unlock(struct tdb_context *tdb, int list, int ltype)
+{       /* Unlock the given list.  Returns 0 without unlocking when an
+         * allrecord lock covers the request, -1 with ecode TDB_ERR_LOCK
+         * when an allrecord lock of an incompatible type is held, and
+         * otherwise the result of tdb_nest_unlock() on lock_offset(list). */
+        /* an allrecord lock allows us to avoid per chain locks */
+        if (tdb->allrecord_lock.count &&
+            (ltype == tdb->allrecord_lock.ltype || ltype == F_RDLCK)) {
+                return 0;
+        }
+        if (tdb->allrecord_lock.count) {
+                tdb->ecode = TDB_ERR_LOCK;
+                return -1;
+        }
+        return tdb_nest_unlock(tdb, lock_offset(list), ltype, false);
+}
 /*
   get the transaction lock
  */
+int tdb_transaction_lock(struct tdb_context *tdb, int ltype)
+{
+        if (tdb->global_lock.count) {
+                return 0;
+        }
+        if (tdb->transaction_lock_count > 0) {
+                tdb->transaction_lock_count++;
+                return 0;
+        }
+        if (tdb->methods->tdb_brlock(tdb, TRANSACTION_LOCK, ltype,
+                                     F_SETLKW, 0, 1) == -1) {
+                TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_transaction_lock: failed to get transaction lock\n"));
+                tdb->ecode = TDB_ERR_LOCK;
+                return -1;
+        }
+        tdb->transaction_lock_count++;
+        return 0;
+int tdb_transaction_lock(struct tdb_context *tdb, int ltype,
+                         enum tdb_lock_flags lockflags)
+{
+        return tdb_nest_lock(tdb, TRANSACTION_LOCK, ltype, lockflags);
+}
 …
   release the transaction lock
  */
+int tdb_transaction_unlock(struct tdb_context *tdb)
+{       /* Pre-change implementation: drop one reference on the transaction
+         * lock, releasing the byte-range lock only for the last reference.
+         *
+         * Returns 0 immediately while a global lock is held, and 0 after
+         * decrementing the counter when more than one reference remains.
+         * Otherwise returns the result of the tdb_brlock() unlock call. */
+        int ret;
+        if (tdb->global_lock.count) {
+                return 0;
+        }
+        if (tdb->transaction_lock_count > 1) {
+                tdb->transaction_lock_count--;
+                return 0;
+        }
+        ret = tdb->methods->tdb_brlock(tdb, TRANSACTION_LOCK, F_UNLCK, F_SETLKW, 0, 1);
+        if (ret == 0) { /* the counter is only cleared when the unlock succeeded */
+                tdb->transaction_lock_count = 0;
+        }
+        return ret;
+}
+/* lock/unlock entire database */
+static int _tdb_lockall(struct tdb_context *tdb, int ltype, int op)
+{
+        bool mark_lock = ((ltype & TDB_MARK_LOCK) == TDB_MARK_LOCK);
+        ltype &= ~TDB_MARK_LOCK;
+int tdb_transaction_unlock(struct tdb_context *tdb, int ltype)
+{
+        return tdb_nest_unlock(tdb, TRANSACTION_LOCK, ltype, false);
+}
+/* Returns 0 if all done, -1 if error, 1 if ok. */
+static int tdb_allrecord_check(struct tdb_context *tdb, int ltype,
+                               enum tdb_lock_flags flags, bool upgradable)
+{
         /* There are no locks on read-only dbs */
         if (tdb->read_only || tdb->traverse_read) {
 …
+        }
         if (tdb->global_lock.count && tdb->global_lock.ltype == ltype) {
                 tdb->global_lock.count++;
                 return 0;
+        }
         if (tdb->global_lock.count) {
+        if (tdb->allrecord_lock.count && tdb->allrecord_lock.ltype == ltype) {
+                tdb->allrecord_lock.count++;
+                return 0;
+        }
+        if (tdb->allrecord_lock.count) {
                 /* a global lock of a different type exists */
                 tdb->ecode = TDB_ERR_LOCK;
                 return -1;
+        }
         if (tdb->num_locks != 0) {
+        if (tdb_have_extra_locks(tdb)) {
                 /* can't combine global and chain locks */
                 tdb->ecode = TDB_ERR_LOCK;
 …
+        }
+        if (!mark_lock &&
+            tdb->methods->tdb_brlock(tdb, FREELIST_TOP, ltype, op,
+, 4*tdb->header.hash_size)) {
+                if (op == F_SETLKW) {
+                        TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_lockall failed (%s)\n", strerror(errno)));
+        if (upgradable && ltype != F_RDLCK) {
+                /* tdb error: you can't upgrade a write lock! */
+                tdb->ecode = TDB_ERR_LOCK;
+                return -1;
+        }
+        return 1;
+}
+/* We only need to lock individual bytes, but Linux merges consecutive locks
+ * so we lock in contiguous ranges.
+ *
+ * Locks the range starting at off and spanning len bytes with lock type
+ * ltype.  A range of at most 4 bytes is locked with a single tdb_brlock()
+ * call using the caller's flags, and that call's result is returned as is.
+ * A larger range is first attempted in one call with TDB_LOCK_WAIT cleared
+ * from the flags; if that attempt fails for any reason, the range is split
+ * into a first half of len / 2 bytes and a second half holding the rest,
+ * and each half is locked recursively with the original flags.
+ *
+ * Returns 0 once the whole range is locked and -1 if either half fails.
+ * When the second half fails, the first half is unlocked before returning. */
+static int tdb_chainlock_gradual(struct tdb_context *tdb,
+                                 int ltype, enum tdb_lock_flags flags,
+                                 size_t off, size_t len)
+{
+        int ret;
+        enum tdb_lock_flags nb_flags = (flags & ~TDB_LOCK_WAIT); /* caller's flags without TDB_LOCK_WAIT */
+        if (len <= 4) {
+                /* Single record.  Just do blocking lock. */
+                return tdb_brlock(tdb, ltype, off, len, flags);
+        }
+        /* First we try non-blocking. */
+        ret = tdb_brlock(tdb, ltype, off, len, nb_flags);
+        if (ret == 0) {
+                return 0;
+        }
+        /* Try locking first half, then second. */
+        ret = tdb_chainlock_gradual(tdb, ltype, flags, off, len / 2);
+        if (ret == -1)
+                return -1;
+        ret = tdb_chainlock_gradual(tdb, ltype, flags,
+                                    off + len / 2, len - len / 2);
+        if (ret == -1) {
+                tdb_brunlock(tdb, ltype, off, len / 2); /* undo the first half; its result is ignored */
+                return -1;
+        }
+        return 0;
+}
+/* lock/unlock entire database.  It can only be upgradable if you have some
+ * other way of guaranteeing exclusivity (ie. transaction write lock).
+ * We do the locking gradually to avoid being starved by smaller locks. */
+int tdb_allrecord_lock(struct tdb_context *tdb, int ltype,
+                       enum tdb_lock_flags flags, bool upgradable)
+{
+        switch (tdb_allrecord_check(tdb, ltype, flags, upgradable)) {
+        case -1:
+                return -1;
+        case 0:
+                return 0;
+        }
+        /* We cover two kinds of locks:
+         * 1) Normal chain locks.  Taken for almost all operations.
+         * 2) Individual record locks.  Taken after normal or free
+         *    chain locks.
+         *
+         * It is (1) which causes the starvation problem, so we're only
+         * gradual for that. */
+        if (tdb_chainlock_gradual(tdb, ltype, flags, FREELIST_TOP,
+                                  tdb->header.hash_size * 4) == -1) {
+                return -1;
+        }
+        /* Grab individual record locks. */
+        if (tdb_brlock(tdb, ltype, lock_offset(tdb->header.hash_size), 0,
+                       flags) == -1) {
+                tdb_brunlock(tdb, ltype, FREELIST_TOP,
+                             tdb->header.hash_size * 4);
+                return -1;
+        }
+        tdb->allrecord_lock.count = 1;
+        /* If it's upgradable, it's actually exclusive so we can treat
+         * it as a write lock. */
+        tdb->allrecord_lock.ltype = upgradable ? F_WRLCK : ltype;
+        tdb->allrecord_lock.off = upgradable;
+        if (tdb_needs_recovery(tdb)) {
+                bool mark = flags & TDB_LOCK_MARK_ONLY;
+                tdb_allrecord_unlock(tdb, ltype, mark);
+                if (mark) {
+                        tdb->ecode = TDB_ERR_LOCK;
+                        TDB_LOG((tdb, TDB_DEBUG_ERROR,
+                                 "tdb_lockall_mark cannot do recovery\n"));
+                        return -1;
+                }
                 return -1;
+        }
         tdb->global_lock.count = 1;
         tdb->global_lock.ltype = ltype;
+                if (tdb_lock_and_recover(tdb) == -1) {
+                        return -1;
+                }
+                return tdb_allrecord_lock(tdb, ltype, flags, upgradable);
+        }
         return 0;
 …
 /* unlock entire db */
+static int _tdb_unlockall(struct tdb_context *tdb, int ltype)
+{
+        bool mark_lock = ((ltype & TDB_MARK_LOCK) == TDB_MARK_LOCK);
+        ltype &= ~TDB_MARK_LOCK;
+int tdb_allrecord_unlock(struct tdb_context *tdb, int ltype, bool mark_lock)
+{
         /* There are no locks on read-only dbs */
         if (tdb->read_only || tdb->traverse_read) {
 …
+        }
         if (tdb->global_lock.ltype != ltype || tdb->global_lock.count == 0) {
+        if (tdb->allrecord_lock.count == 0) {
                 tdb->ecode = TDB_ERR_LOCK;
                 return -1;
+        }
+        if (tdb->global_lock.count > 1) {
+                tdb->global_lock.count--;
+                return 0;
+        }
+        if (!mark_lock &&
+            tdb->methods->tdb_brlock(tdb, FREELIST_TOP, F_UNLCK, F_SETLKW,
+, 4*tdb->header.hash_size)) {
+        /* Upgradable locks are marked as write locks. */
+        if (tdb->allrecord_lock.ltype != ltype
+            && (!tdb->allrecord_lock.off || ltype != F_RDLCK)) {
+                tdb->ecode = TDB_ERR_LOCK;
+                return -1;
+        }
+        if (tdb->allrecord_lock.count > 1) {
+                tdb->allrecord_lock.count--;
+                return 0;
+        }
+        if (!mark_lock && tdb_brunlock(tdb, ltype, FREELIST_TOP, 0)) {
                 TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_unlockall failed (%s)\n", strerror(errno)));
                 return -1;
+        }
         tdb->global_lock.count = 0;
         tdb->global_lock.ltype = 0;
+        tdb->allrecord_lock.count = 0;
+        tdb->allrecord_lock.ltype = 0;
         return 0;
 …
 /* lock entire database with write lock */
 int tdb_lockall(struct tdb_context *tdb)
+_PUBLIC_ int tdb_lockall(struct tdb_context *tdb)
+{
         tdb_trace(tdb, "tdb_lockall");
         return _tdb_lockall(tdb, F_WRLCK, F_SETLKW);
+        return tdb_allrecord_lock(tdb, F_WRLCK, TDB_LOCK_WAIT, false);
+}
 /* lock entire database with write lock - mark only */
 int tdb_lockall_mark(struct tdb_context *tdb)
+_PUBLIC_ int tdb_lockall_mark(struct tdb_context *tdb)
+{
         tdb_trace(tdb, "tdb_lockall_mark");
         return _tdb_lockall(tdb, F_WRLCK | TDB_MARK_LOCK, F_SETLKW);
+        return tdb_allrecord_lock(tdb, F_WRLCK, TDB_LOCK_MARK_ONLY, false);
+}
 /* unlock entire database with write lock - unmark only */
 int tdb_lockall_unmark(struct tdb_context *tdb)
+_PUBLIC_ int tdb_lockall_unmark(struct tdb_context *tdb)
+{
         tdb_trace(tdb, "tdb_lockall_unmark");
         return _tdb_unlockall(tdb, F_WRLCK | TDB_MARK_LOCK);
+        return tdb_allrecord_unlock(tdb, F_WRLCK, true);
+}
 /* lock entire database with write lock - nonblocking variant */
 int tdb_lockall_nonblock(struct tdb_context *tdb)
+{
         int ret = _tdb_lockall(tdb, F_WRLCK, F_SETLK);
+_PUBLIC_ int tdb_lockall_nonblock(struct tdb_context *tdb)
+{
+        int ret = tdb_allrecord_lock(tdb, F_WRLCK, TDB_LOCK_NOWAIT, false);
         tdb_trace_ret(tdb, "tdb_lockall_nonblock", ret);
         return ret;
 …
 /* unlock entire database with write lock */
 int tdb_unlockall(struct tdb_context *tdb)
+_PUBLIC_ int tdb_unlockall(struct tdb_context *tdb)
+{
         tdb_trace(tdb, "tdb_unlockall");
         return _tdb_unlockall(tdb, F_WRLCK);
+        return tdb_allrecord_unlock(tdb, F_WRLCK, false);
+}
 /* lock entire database with read lock */
 int tdb_lockall_read(struct tdb_context *tdb)
+_PUBLIC_ int tdb_lockall_read(struct tdb_context *tdb)
+{
         tdb_trace(tdb, "tdb_lockall_read");
         return _tdb_lockall(tdb, F_RDLCK, F_SETLKW);
+        return tdb_allrecord_lock(tdb, F_RDLCK, TDB_LOCK_WAIT, false);
+}
 /* lock entire database with read lock - nonblocking variant */
 int tdb_lockall_read_nonblock(struct tdb_context *tdb)
+{
         int ret = _tdb_lockall(tdb, F_RDLCK, F_SETLK);
+_PUBLIC_ int tdb_lockall_read_nonblock(struct tdb_context *tdb)
+{
+        int ret = tdb_allrecord_lock(tdb, F_RDLCK, TDB_LOCK_NOWAIT, false);
         tdb_trace_ret(tdb, "tdb_lockall_read_nonblock", ret);
         return ret;
 …
 /* unlock entire database with read lock */
 int tdb_unlockall_read(struct tdb_context *tdb)
+_PUBLIC_ int tdb_unlockall_read(struct tdb_context *tdb)
+{
         tdb_trace(tdb, "tdb_unlockall_read");
         return _tdb_unlockall(tdb, F_RDLCK);
+        return tdb_allrecord_unlock(tdb, F_RDLCK, false);
+}
 /* lock/unlock one hash chain. This is meant to be used to reduce
    contention - it cannot guarantee how many records will be locked */
 int tdb_chainlock(struct tdb_context *tdb, TDB_DATA key)
+_PUBLIC_ int tdb_chainlock(struct tdb_context *tdb, TDB_DATA key)
+{
         int ret = tdb_lock(tdb, BUCKET(tdb->hash_fn(&key)), F_WRLCK);
 …
    to reduce contention - it cannot guarantee how many records will be
    locked */
 int tdb_chainlock_nonblock(struct tdb_context *tdb, TDB_DATA key)
+_PUBLIC_ int tdb_chainlock_nonblock(struct tdb_context *tdb, TDB_DATA key)
+{
         int ret = tdb_lock_nonblock(tdb, BUCKET(tdb->hash_fn(&key)), F_WRLCK);
 …
 /* mark a chain as locked without actually locking it. Warning! use with great caution! */
+int tdb_chainlock_mark(struct tdb_context *tdb, TDB_DATA key)
+{
+        int ret = tdb_lock(tdb, BUCKET(tdb->hash_fn(&key)), F_WRLCK | TDB_MARK_LOCK);
+_PUBLIC_ int tdb_chainlock_mark(struct tdb_context *tdb, TDB_DATA key)
+{
+        int ret = tdb_nest_lock(tdb, lock_offset(BUCKET(tdb->hash_fn(&key))),
+                                F_WRLCK, TDB_LOCK_MARK_ONLY);
         tdb_trace_1rec(tdb, "tdb_chainlock_mark", key);
         return ret;
 …
 /* unmark a chain as locked without actually locking it. Warning! use with great caution! */
 int tdb_chainlock_unmark(struct tdb_context *tdb, TDB_DATA key)
+_PUBLIC_ int tdb_chainlock_unmark(struct tdb_context *tdb, TDB_DATA key)
+{
         tdb_trace_1rec(tdb, "tdb_chainlock_unmark", key);
+        return tdb_unlock(tdb, BUCKET(tdb->hash_fn(&key)), F_WRLCK | TDB_MARK_LOCK);
+}
+int tdb_chainunlock(struct tdb_context *tdb, TDB_DATA key)
+        return tdb_nest_unlock(tdb, lock_offset(BUCKET(tdb->hash_fn(&key))),
+                               F_WRLCK, true);
+}
+_PUBLIC_ int tdb_chainunlock(struct tdb_context *tdb, TDB_DATA key)
+{
         tdb_trace_1rec(tdb, "tdb_chainunlock", key);
 …
+}
 int tdb_chainlock_read(struct tdb_context *tdb, TDB_DATA key)
+_PUBLIC_ int tdb_chainlock_read(struct tdb_context *tdb, TDB_DATA key)
+{
         int ret;
 …
+}
 int tdb_chainunlock_read(struct tdb_context *tdb, TDB_DATA key)
+_PUBLIC_ int tdb_chainunlock_read(struct tdb_context *tdb, TDB_DATA key)
+{
         tdb_trace_1rec(tdb, "tdb_chainunlock_read", key);
 …
+}
 /* record lock stops delete underneath */
 int tdb_lock_record(struct tdb_context *tdb, tdb_off_t off)
+{
         if (tdb->global_lock.count) {
                 return 0;
+        }
         return off ? tdb->methods->tdb_brlock(tdb, off, F_RDLCK, F_SETLKW, 0, 1) : 0;
+        if (tdb->allrecord_lock.count) {
+                return 0;
+        }
+        return off ? tdb_brlock(tdb, F_RDLCK, off, 1, TDB_LOCK_WAIT) : 0;
+}
 …
                 if (i->off == off)
                         return -1;
+        return tdb->methods->tdb_brlock(tdb, off, F_WRLCK, F_SETLK, 1, 1);
+}
+/*
+  Note this is meant to be F_SETLK, *not* F_SETLKW, as it's not
+  an error to fail to get the lock here.
+*/
+        if (tdb->allrecord_lock.count) {
+                if (tdb->allrecord_lock.ltype == F_WRLCK) {
+                        return 0;
+                }
+                return -1;
+        }
+        return tdb_brlock(tdb, F_WRLCK, off, 1, TDB_LOCK_NOWAIT|TDB_LOCK_PROBE);
+}
 int tdb_write_unlock_record(struct tdb_context *tdb, tdb_off_t off)
+{
+        return tdb->methods->tdb_brlock(tdb, off, F_UNLCK, F_SETLK, 0, 1);
+        if (tdb->allrecord_lock.count) {
+                return 0;
+        }
+        return tdb_brunlock(tdb, F_WRLCK, off, 1);
+}
 …
         uint32_t count = 0;
         if (tdb->global_lock.count) {
+        if (tdb->allrecord_lock.count) {
                 return 0;
+        }
 …
                 if (i->off == off)
                         count++;
+        return (count == 1 ? tdb->methods->tdb_brlock(tdb, off, F_UNLCK, F_SETLKW, 0, 1) : 0);
+}
+        return (count == 1 ? tdb_brunlock(tdb, F_RDLCK, off, 1) : 0);
+}
+bool tdb_have_extra_locks(struct tdb_context *tdb)
+{       /* Report whether the context holds locks beyond the expected ones.
+         *
+         * Returns true at once when an all-record lock is held outside a
+         * transaction.  Otherwise starts from tdb->num_lockrecs, subtracts
+         * one if an ACTIVE_LOCK entry exists and one more if a transaction
+         * is running and a TRANSACTION_LOCK entry exists, and returns true
+         * when the remainder is non-zero. */
+        unsigned int extra = tdb->num_lockrecs;
+        /* A transaction holds the lock for all records. */
+        if (!tdb->transaction && tdb->allrecord_lock.count) {
+                return true;
+        }
+        /* We always hold the active lock if CLEAR_IF_FIRST. */
+        if (find_nestlock(tdb, ACTIVE_LOCK)) {
+                extra--;
+        }
+        /* In a transaction, we expect to hold the transaction lock */
+        if (tdb->transaction && find_nestlock(tdb, TRANSACTION_LOCK)) {
+                extra--;
+        }
+        return extra; /* converted to bool: true for any non-zero count */
+}
+/* The transaction code uses this to remove all locks.
+ *
+ * If an all-record lock is held it is unlocked starting at FREELIST_TOP and
+ * its count is reset to zero.  Every entry in tdb->lockrecs is then unlocked
+ * except entries at ACTIVE_LOCK, which are kept and moved to the front of
+ * the array.  tdb->num_lockrecs is set to the number of kept entries, and
+ * the array is freed when none remain.  The results of the tdb_brunlock()
+ * calls are not checked. */
+void tdb_release_transaction_locks(struct tdb_context *tdb)
+{
+        unsigned int i, active = 0; /* active: count of ACTIVE_LOCK entries kept so far */
+        if (tdb->allrecord_lock.count != 0) {
+                tdb_brunlock(tdb, tdb->allrecord_lock.ltype, FREELIST_TOP, 0); /* NOTE(review): length 0 presumably means "to end of file" - confirm */
+                tdb->allrecord_lock.count = 0;
+        }
+        for (i=0;i<tdb->num_lockrecs;i++) {
+                struct tdb_lock_type *lck = &tdb->lockrecs[i];
+                /* Don't release the active lock!  Copy it to first entry. */
+                if (lck->off == ACTIVE_LOCK) {
+                        tdb->lockrecs[active++] = *lck;
+                } else {
+                        tdb_brunlock(tdb, lck->ltype, lck->off, 1);
+                }
+        }
+        tdb->num_lockrecs = active;
+        if (tdb->num_lockrecs == 0) {
+                SAFE_FREE(tdb->lockrecs);
+        }
+}

trunk/server/lib/tdb/common/open.c

-              r456
+              r745
    Copyright (C) Paul `Rusty' Russell              2000
    Copyright (C) Jeremy Allison                    2000-2003
      ** NOTE! The following LGPL license applies to the tdb
      ** library. This does NOT imply that all of Samba is released
      ** under the LGPL
    This library is free software; you can redistribute it and/or
    modify it under the terms of the GNU Lesser General Public
 …
 static struct tdb_context *tdbs = NULL;
+/* This is based on the hash algorithm from gdbm.
+ *
+ * Hashes the key->dsize bytes at key->dptr.  The running value is seeded
+ * with 0x238F13AF * key->dsize; each byte is then added after being shifted
+ * left by (i*5 % 24) bits, where i is the byte's index.  The result is
+ * 1103515243 * value + 12345. */
+static unsigned int default_tdb_hash(TDB_DATA *key)
+{
+        uint32_t value; /* Running hash value. */
+        uint32_t   i;   /* Index of the key byte being mixed in. */
+        /* Set the initial value from the key size. */
+        for (value = 0x238F13AF * key->dsize, i=0; i < key->dsize; i++)
+                value = (value + (key->dptr[i] << (i*5 % 24)));
+        return (1103515243 * value + 12345);
+}
+/* We use two hashes to double-check they're using the right hash function.
+ *
+ * Runs tdb->hash_fn over two fixed keys and stores the results:
+ *   *magic1_hash - hash of TDB_MAGIC_FOOD over sizeof(TDB_MAGIC_FOOD) bytes
+ *                  (presumably including the terminating NUL - confirm that
+ *                  TDB_MAGIC_FOOD is a string literal)
+ *   *magic2_hash - hash of the sizeof(uint32_t) bytes of TDB_MAGIC after
+ *                  CONVERT() (presumably a byte-order conversion - confirm)
+ * If both hashes come out as zero, *magic1_hash is forced to 1. */
+void tdb_header_hash(struct tdb_context *tdb,
+                     uint32_t *magic1_hash, uint32_t *magic2_hash)
+{
+        TDB_DATA hash_key;
+        uint32_t tdb_magic = TDB_MAGIC; /* local copy so CONVERT() has an object to work on */
+        hash_key.dptr = discard_const_p(unsigned char, TDB_MAGIC_FOOD);
+        hash_key.dsize = sizeof(TDB_MAGIC_FOOD);
+        *magic1_hash = tdb->hash_fn(&hash_key);
+        hash_key.dptr = (unsigned char *)CONVERT(tdb_magic);
+        hash_key.dsize = sizeof(tdb_magic);
+        *magic2_hash = tdb->hash_fn(&hash_key);
+        /* Make sure at least one hash is non-zero! */
+        if (*magic1_hash == 0 && *magic2_hash == 0)
+                *magic1_hash = 1;
+}
 /* initialise a new database with a specified hash size */
 …
         size_t size;
         int ret = -1;
-        ssize_t written;
         /* We make it up in memory, then write it out if not internal */
 …
         newdb->version = TDB_VERSION;
         newdb->hash_size = hash_size;
+        tdb_header_hash(tdb, &newdb->magic1_hash, &newdb->magic2_hash);
+        /* Make sure older tdbs (which don't check the magic hash fields)
+         * will refuse to open this TDB. */
+        if (tdb->flags & TDB_INCOMPATIBLE_HASH)
+                newdb->rwlocks = TDB_HASH_RWLOCK_MAGIC;
         if (tdb->flags & TDB_INTERNAL) {
                 tdb->map_size = size;
 …
         memcpy(newdb->magic_food, TDB_MAGIC_FOOD, strlen(TDB_MAGIC_FOOD)+1);
         /* we still have "ret == -1" here */
+        written = write(tdb->fd, newdb, size);
+        if (written == size) {
+        if (tdb_write_all(tdb->fd, newdb, size))
                 ret = 0;
-        } else if (written != -1) {
-                /* call write once again, this usually should return -1 and
-                 * set errno appropriately */
-                size -= written;
-                written = write(tdb->fd, newdb+written, size);
-                if (written == size) {
-                ret = 0;
-                } else if (written >= 0) {
-                        /* a second incomplete write - we give up.
-                         * guessing the errno... */
-                        errno = ENOSPC;
+                }
+        }
   fail:
 …
+{
         struct tdb_context *i;
         for (i = tdbs; i; i = i->next) {
                 if (i->device == device && i->inode == ino) {
 …
    @param name may be NULL for internal databases. */
 struct tdb_context *tdb_open(const char *name, int hash_size, int tdb_flags,
+_PUBLIC_ struct tdb_context *tdb_open(const char *name, int hash_size, int tdb_flags,
                       int open_flags, mode_t mode)
+{
 …
+}
+struct tdb_context *tdb_open_ex(const char *name, int hash_size, int tdb_flags,
+static bool check_header_hash(struct tdb_context *tdb,
+                              bool default_hash, uint32_t *m1, uint32_t *m2)
+{       /* Check that tdb->hash_fn reproduces the magic hashes in the header.
+         *
+         * Computes both header hashes into *m1 and *m2 and returns true if
+         * they equal tdb->header.magic1_hash and tdb->header.magic2_hash.
+         * On a mismatch with default_hash false, returns false.  On a
+         * mismatch with default_hash true, tdb->hash_fn is switched between
+         * tdb_old_hash and tdb_jenkins_hash and the check is repeated once
+         * with default_hash false.  After a failed retry tdb->hash_fn is
+         * left on the switched function, and *m1 / *m2 hold the values from
+         * the last attempt. */
+        tdb_header_hash(tdb, m1, m2);
+        if (tdb->header.magic1_hash == *m1 &&
+            tdb->header.magic2_hash == *m2) {
+                return true;
+        }
+        /* If they explicitly set a hash, always respect it. */
+        if (!default_hash)
+                return false;
+        /* Otherwise, try the other inbuilt hash. */
+        if (tdb->hash_fn == tdb_old_hash)
+                tdb->hash_fn = tdb_jenkins_hash;
+        else
+                tdb->hash_fn = tdb_old_hash;
+        return check_header_hash(tdb, false, m1, m2); /* false: at most one retry */
+}
+_PUBLIC_ struct tdb_context *tdb_open_ex(const char *name, int hash_size, int tdb_flags,
                                 int open_flags, mode_t mode,
                                 const struct tdb_logging_context *log_ctx,
 …
         uint32_t vertest;
         unsigned v;
+        const char *hash_alg;
+        uint32_t magic1, magic2;
         if (!(tdb = (struct tdb_context *)calloc(1, sizeof *tdb))) {
 …
                 tdb->log.log_private = NULL;
+        }
+        tdb->hash_fn = hash_fn ? hash_fn : default_tdb_hash;
+        if (name == NULL && (tdb_flags & TDB_INTERNAL)) {
+                name = "__TDB_INTERNAL__";
+        }
+        if (name == NULL) {
+                tdb->name = discard_const_p(char, "__NULL__");
+                TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_open_ex: called with name == NULL\n"));
+                tdb->name = NULL;
+                errno = EINVAL;
+                goto fail;
+        }
+        /* now make a copy of the name, as the caller's memory might go away */
+        if (!(tdb->name = (char *)strdup(name))) {
+                /*
+                 * set the name as the given string, so that tdb_name() will
+                 * work in case of an error.
+                 */
+                tdb->name = discard_const_p(char, name);
+                TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_open_ex: can't strdup(%s)\n",
+                         name));
+                tdb->name = NULL;
+                errno = ENOMEM;
+                goto fail;
+        }
+        if (hash_fn) {
+                tdb->hash_fn = hash_fn;
+                hash_alg = "the user defined";
+        } else {
+                /* This controls what we use when creating a tdb. */
+                if (tdb->flags & TDB_INCOMPATIBLE_HASH) {
+                        tdb->hash_fn = tdb_jenkins_hash;
+                } else {
+                        tdb->hash_fn = tdb_old_hash;
+                }
+                hash_alg = "either default";
+        }
 #ifdef __OS2__
 …
+        }
 #endif
         /* cache the page size */
         tdb->page_size = getpagesize();
 …
                 goto fail;
+        }
         if (hash_size == 0)
                 hash_size = DEFAULT_HASH_SIZE;
 …
+        }
+        if (getenv("TDB_NO_FSYNC")) {
+                tdb->flags |= TDB_NOSYNC;
+        }
         /*
          * TDB_ALLOW_NESTING is the default behavior.
 …
         /* ensure there is only one process initialising at once */
         if (tdb->methods->tdb_brlock(tdb, GLOBAL_LOCK, F_WRLCK, F_SETLKW, 0, 1) == -1) {
                 TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_open_ex: failed to get global lock on %s: %s\n",
+        if (tdb_nest_lock(tdb, OPEN_LOCK, F_WRLCK, TDB_LOCK_WAIT) == -1) {
+                TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_open_ex: failed to get open lock on %s: %s\n",
                          name, strerror(errno)));
                 goto fail;      /* errno set by tdb_brlock */
 …
         /* we need to zero database if we are the only one with it open */
         if ((tdb_flags & TDB_CLEAR_IF_FIRST) &&
-            (!tdb->read_only)
 #ifndef __OS2__
+            && (locked = (tdb->methods->tdb_brlock(tdb, ACTIVE_LOCK, F_WRLCK, F_SETLK, 0, 1) == 0))
+#endif
+            ) {
+            (!tdb->read_only) &&
+            (locked = (tdb_nest_lock(tdb, ACTIVE_LOCK, F_WRLCK, TDB_LOCK_NOWAIT|TDB_LOCK_PROBE) == 0))) {
+#else
+            (!tdb->read_only) ) {
+#endif
                 open_flags |= O_CREAT;
                 if (ftruncate(tdb->fd, 0) == -1) {
 …
                 goto fail;
+        if (tdb->header.rwlocks != 0) {
+        if (tdb->header.rwlocks != 0 &&
+            tdb->header.rwlocks != TDB_HASH_RWLOCK_MAGIC) {
                 TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_open_ex: spinlocks no longer supported\n"));
+                goto fail;
+        }
+        if ((tdb->header.magic1_hash == 0) && (tdb->header.magic2_hash == 0)) {
+                /* older TDB without magic hash references */
+                tdb->hash_fn = tdb_old_hash;
+        } else if (!check_header_hash(tdb, !hash_fn, &magic1, &magic2)) {
+                TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_open_ex: "
+                         "%s was not created with %s hash function we are using\n"
+                         "magic1_hash[0x%08X %s 0x%08X] "
+                         "magic2_hash[0x%08X %s 0x%08X]\n",
+                         name, hash_alg,
+                         tdb->header.magic1_hash,
+                         (tdb->header.magic1_hash == magic1) ? "==" : "!=",
+                         magic1,
+                         tdb->header.magic2_hash,
+                         (tdb->header.magic2_hash == magic2) ? "==" : "!=",
+                         magic2));
+                errno = EINVAL;
                 goto fail;
+        }
 …
+        }
-        if (!(tdb->name = (char *)strdup(name))) {
-                errno = ENOMEM;
-                goto fail;
+        }
         tdb->map_size = st.st_size;
         tdb->device = st.st_dev;
 …
         if (locked) {
 #ifndef __OS2__
                 if (tdb->methods->tdb_brlock(tdb, ACTIVE_LOCK, F_UNLCK, F_SETLK, 0, 1) == -1) {
+                if (tdb_nest_unlock(tdb, ACTIVE_LOCK, F_WRLCK, false) == -1) {
                         TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_open_ex: "
                                  "failed to take ACTIVE_LOCK on %s: %s\n",
+                                 "failed to release ACTIVE_LOCK on %s: %s\n",
                                  name, strerror(errno)));
                         goto fail;
+                }
 #endif
+        }
 …
         if (tdb_flags & TDB_CLEAR_IF_FIRST) {
                 /* leave this lock in place to indicate it's in use */
                 if (tdb->methods->tdb_brlock(tdb, ACTIVE_LOCK, F_RDLCK, F_SETLKW, 0, 1) == -1)
+                if (tdb_nest_lock(tdb, ACTIVE_LOCK, F_RDLCK, TDB_LOCK_WAIT) == -1) {
                         goto fail;
+                }
+        }
 #endif
 …
 #ifdef __OS2__
         // YD internal databases do not get global lock!
         if (tdb->methods->tdb_brlock(tdb, GLOBAL_LOCK, F_UNLCK, F_SETLKW, 0, 1) == -1)
+        if (tdb_nest_unlock(tdb, OPEN_LOCK, F_WRLCK, false) == -1)
                 goto fail;
 #endif
 …
         /* Internal (memory-only) databases skip all the code above to
          * do with disk files, and resume here by releasing their
          * global lock and hooking into the active list. */
+         * open lock and hooking into the active list. */
 #ifndef __OS2__
+        if (tdb->methods->tdb_brlock(tdb, GLOBAL_LOCK, F_UNLCK, F_SETLKW, 0, 1) == -1)
+                goto fail;
+        if (tdb_nest_unlock(tdb, OPEN_LOCK, F_WRLCK, false) == -1) {
+                goto fail;
+        }
 #endif
         tdb->next = tdbs;
                 tdbs = tdb;
+        tdbs = tdb;
         return tdb;
 …
                         tdb_munmap(tdb);
+        }
+        SAFE_FREE(tdb->name);
+        if (tdb->fd != -1)
+                if (close(tdb->fd) != 0)
+                        TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_open_ex: failed to close tdb->fd on error!\n"));
 #ifdef __OS2__
         DosCloseMutexSem( tdb->hGlobalLock);
 …
         tdb->hTransactionLock = 0;
 #endif
+        if (tdb->fd != -1)
+                if (close(tdb->fd) != 0)
+                        TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_open_ex: failed to close tdb->fd on error!\n"));
+        SAFE_FREE(tdb->lockrecs);
+        SAFE_FREE(tdb->name);
         SAFE_FREE(tdb);
         errno = save_errno;
 …
  */
 void tdb_set_max_dead(struct tdb_context *tdb, int max_dead)
+_PUBLIC_ void tdb_set_max_dead(struct tdb_context *tdb, int max_dead)
+{
         tdb->max_dead_records = max_dead;
 …
  * @returns -1 for error; 0 for success.
  **/
 int tdb_close(struct tdb_context *tdb)
+_PUBLIC_ int tdb_close(struct tdb_context *tdb)
+{
         struct tdb_context **i;
         int ret = 0;
+        if (tdb->transaction) {
+                tdb_transaction_cancel(tdb);
+        }
         tdb_trace(tdb, "tdb_close");
-        if (tdb->transaction) {
-                _tdb_transaction_cancel(tdb);
+        }
         if (tdb->map_ptr) {
 …
         // YD internal databases do not have a global lock
         if (!(tdb->flags & TDB_INTERNAL))
                 tdb->methods->tdb_brlock(tdb, GLOBAL_LOCK, F_UNLCK, F_SETLK, 0, 1);
+        tdb_nest_unlock(tdb, OPEN_LOCK, F_WRLCK, false);
 #endif
         if (tdb->fd != -1) {
 …
 /* register a logging function */
 void tdb_set_logging_function(struct tdb_context *tdb,
                               const struct tdb_logging_context *log_ctx)
+_PUBLIC_ void tdb_set_logging_function(struct tdb_context *tdb,
+                                       const struct tdb_logging_context *log_ctx)
+{
         tdb->log = *log_ctx;
+}
 void *tdb_get_logging_private(struct tdb_context *tdb)
+_PUBLIC_ void *tdb_get_logging_private(struct tdb_context *tdb)
+{
         return tdb->log.log_private;
 …
 #endif
         if (tdb->num_locks != 0 || tdb->global_lock.count) {
+        if (tdb_have_extra_locks(tdb)) {
                 TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_reopen: reopen not allowed with locks held\n"));
                 goto fail;
 …
 #endif /* fake pread or pwrite */
+        if (active_lock &&
+            (tdb->methods->tdb_brlock(tdb, ACTIVE_LOCK, F_RDLCK, F_SETLKW, 0, 1) == -1)) {
+        /* We may still think we hold the active lock. */
+        tdb->num_lockrecs = 0;
+        SAFE_FREE(tdb->lockrecs);
+        if (active_lock && tdb_nest_lock(tdb, ACTIVE_LOCK, F_RDLCK, TDB_LOCK_WAIT) == -1) {
                 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_reopen: failed to obtain active lock\n"));
                 goto fail;
 …
 /* reopen a tdb - this can be used after a fork to ensure that we have an independent
    seek pointer from our parent and to re-establish locks */
 int tdb_reopen(struct tdb_context *tdb)
+_PUBLIC_ int tdb_reopen(struct tdb_context *tdb)
+{
         return tdb_reopen_internal(tdb, tdb->flags & TDB_CLEAR_IF_FIRST);
 …
 /* reopen all tdb's */
 int tdb_reopen_all(int parent_longlived)
+_PUBLIC_ int tdb_reopen_all(int parent_longlived)
+{
         struct tdb_context *tdb;
 …
                 // extract path info
                 _splitpath( name, drive, dir, fname, ext);
                 sprintf( szSem, "\\SEM32\\TDB_GL_%s%s%s%i", dir, fname, ext, global_Sem32Add);
+                sprintf( szSem, "\\SEM32\\TDB_OL_%s%s%s%i", dir, fname, ext, global_Sem32Add);
                 rc = DosCreateMutexSem( szSem, &tdb->hGlobalLock, 0, FALSE);
                 if (rc == ERROR_DUPLICATE_NAME)
 …
                         return -1;
+                }
                 TDB_LOG((tdb, TDB_DEBUG_TRACE,"%s pid %d global handle %d\n", caller, getpid(), tdb->hGlobalLock));
+                TDB_LOG((tdb, TDB_DEBUG_TRACE,"%s pid %d open handle %d\n", caller, getpid(), tdb->hGlobalLock));
                 sprintf( szSem, "\\SEM32\\TDB_AL_%s%s%s%i", dir, fname, ext, global_Sem32Add);
 …
+}
 #endif

trunk/server/lib/tdb/common/tdb.c

-              r648
+              r745
    Copyright (C) Paul `Rusty' Russell              2000
    Copyright (C) Jeremy Allison                    2000-2003
      ** NOTE! The following LGPL license applies to the tdb
      ** library. This does NOT imply that all of Samba is released
      ** under the LGPL
    This library is free software; you can redistribute it and/or
    modify it under the terms of the GNU Lesser General Public
 …
 #include "tdb_private.h"
 TDB_DATA tdb_null;
+_PUBLIC_ TDB_DATA tdb_null;
 /*
 …
   the TDB_SEQNUM flag
 */
 void tdb_increment_seqnum_nonblock(struct tdb_context *tdb)
+_PUBLIC_ void tdb_increment_seqnum_nonblock(struct tdb_context *tdb)
+{
         tdb_off_t seqnum=0;
         if (!(tdb->flags & TDB_SEQNUM)) {
                 return;
 …
+        }
+        if (tdb_brlock(tdb, TDB_SEQNUM_OFS, F_WRLCK, F_SETLKW, 1, 1) != 0) {
+        if (tdb_nest_lock(tdb, TDB_SEQNUM_OFS, F_WRLCK,
+                          TDB_LOCK_WAIT|TDB_LOCK_PROBE) != 0) {
                 return;
+        }
 …
         tdb_increment_seqnum_nonblock(tdb);
         tdb_brlock(tdb, TDB_SEQNUM_OFS, F_UNLCK, F_SETLKW, 1, 1);
+        tdb_nest_unlock(tdb, TDB_SEQNUM_OFS, F_WRLCK, false);
+}
 …
+{
         tdb_off_t rec_ptr;
         /* read in the hash top */
         if (tdb_ofs_read(tdb, TDB_HASH_TOP(hash), &rec_ptr) == -1)
 …
+                }
+        }
         /* must be long enough key, data and tailer */
 …
                 return tdb_rec_write(tdb, rec_ptr, &rec);
+        }
         return 0;
+}
 …
+}
 TDB_DATA tdb_fetch(struct tdb_context *tdb, TDB_DATA key)
+_PUBLIC_ TDB_DATA tdb_fetch(struct tdb_context *tdb, TDB_DATA key)
+{
         TDB_DATA ret = _tdb_fetch(tdb, key);
 …
  * should be fast and should not block on other syscalls.
+ *
  * DONT CALL OTHER TDB CALLS FROM THE PARSER, THIS MIGHT LEAD TO SEGFAULTS.
+ * DON'T CALL OTHER TDB CALLS FROM THE PARSER, THIS MIGHT LEAD TO SEGFAULTS.
+ *
  * For mmapped tdb's that do not have a transaction open it points the parsing
 …
  * This is interesting for all readers of potentially large data structures in
  * the tdb records, ldb indexes being one example.
+ *
+ * Return -1 if the record was not found.
  */
 int tdb_parse_record(struct tdb_context *tdb, TDB_DATA key,
+_PUBLIC_ int tdb_parse_record(struct tdb_context *tdb, TDB_DATA key,
                      int (*parser)(TDB_DATA key, TDB_DATA data,
                                    void *private_data),
 …
         if (!(rec_ptr = tdb_find_lock_hash(tdb,key,hash,F_RDLCK,&rec))) {
+                /* record not found */
                 tdb_trace_1rec_ret(tdb, "tdb_parse_record", key, -1);
                 tdb->ecode = TDB_ERR_NOEXIST;
                 return 0;
+                return -1;
+        }
         tdb_trace_1rec_ret(tdb, "tdb_parse_record", key, 0);
 …
+{
         struct tdb_record rec;
         if (tdb_find_lock_hash(tdb, key, hash, F_RDLCK, &rec) == 0)
                 return 0;
 …
+}
 int tdb_exists(struct tdb_context *tdb, TDB_DATA key)
+_PUBLIC_ int tdb_exists(struct tdb_context *tdb, TDB_DATA key)
+{
         uint32_t hash = tdb->hash_fn(&key);
 …
         tdb_off_t rec_ptr;
         struct tdb_record rec;
         /* read in the hash top */
         if (tdb_ofs_read(tdb, TDB_HASH_TOP(hash), &rec_ptr) == -1)
 …
                 return -1;
+        }
         /* read in the hash top */
         if (tdb_ofs_read(tdb, TDB_HASH_TOP(hash), &rec_ptr) == -1)
 …
+}
 int tdb_delete(struct tdb_context *tdb, TDB_DATA key)
+_PUBLIC_ int tdb_delete(struct tdb_context *tdb, TDB_DATA key)
+{
         uint32_t hash = tdb->hash_fn(&key);
 …
+{
         tdb_off_t rec_ptr;
         /* read in the hash top */
         if (tdb_ofs_read(tdb, TDB_HASH_TOP(hash), &rec_ptr) == -1)
 …
    return 0 on success, -1 on failure
 */
 int tdb_store(struct tdb_context *tdb, TDB_DATA key, TDB_DATA dbuf, int flag)
+_PUBLIC_ int tdb_store(struct tdb_context *tdb, TDB_DATA key, TDB_DATA dbuf, int flag)
+{
         uint32_t hash;
 …
 /* Append to an entry. Create if not exist. */
 int tdb_append(struct tdb_context *tdb, TDB_DATA key, TDB_DATA new_dbuf)
+_PUBLIC_ int tdb_append(struct tdb_context *tdb, TDB_DATA key, TDB_DATA new_dbuf)
+{
         uint32_t hash;
 …
         ret = _tdb_store(tdb, key, dbuf, 0, hash);
         tdb_trace_2rec_retrec(tdb, "tdb_append", key, new_dbuf, dbuf);
 failed:
         tdb_unlock(tdb, BUCKET(hash), F_WRLCK);
 …
   useful for external logging functions
 */
 const char *tdb_name(struct tdb_context *tdb)
+_PUBLIC_ const char *tdb_name(struct tdb_context *tdb)
+{
         return tdb->name;
 …
   of the fd
 */
 int tdb_fd(struct tdb_context *tdb)
+_PUBLIC_ int tdb_fd(struct tdb_context *tdb)
+{
         return tdb->fd;
 …
   useful for external tdb routines that wish to log tdb errors
 */
 tdb_log_func tdb_log_fn(struct tdb_context *tdb)
+_PUBLIC_ tdb_log_func tdb_log_fn(struct tdb_context *tdb)
+{
         return tdb->log.log_fn;
 …
   test of a possible tdb change.
 */
 int tdb_get_seqnum(struct tdb_context *tdb)
+_PUBLIC_ int tdb_get_seqnum(struct tdb_context *tdb)
+{
         tdb_off_t seqnum=0;
 …
+}
 int tdb_hash_size(struct tdb_context *tdb)
+_PUBLIC_ int tdb_hash_size(struct tdb_context *tdb)
+{
         return tdb->header.hash_size;
+}
 size_t tdb_map_size(struct tdb_context *tdb)
+_PUBLIC_ size_t tdb_map_size(struct tdb_context *tdb)
+{
         return tdb->map_size;
+}
 int tdb_get_flags(struct tdb_context *tdb)
+_PUBLIC_ int tdb_get_flags(struct tdb_context *tdb)
+{
         return tdb->flags;
+}
 void tdb_add_flags(struct tdb_context *tdb, unsigned flags)
+_PUBLIC_ void tdb_add_flags(struct tdb_context *tdb, unsigned flags)
+{
         if ((flags & TDB_ALLOW_NESTING) &&
 …
+}
 void tdb_remove_flags(struct tdb_context *tdb, unsigned flags)
+_PUBLIC_ void tdb_remove_flags(struct tdb_context *tdb, unsigned flags)
+{
         if ((flags & TDB_ALLOW_NESTING) &&
 …
   enable sequence number handling on an open tdb
 */
 void tdb_enable_seqnum(struct tdb_context *tdb)
+_PUBLIC_ void tdb_enable_seqnum(struct tdb_context *tdb)
+{
         tdb->flags |= TDB_SEQNUM;
 …
 /*
   wipe the entire database, deleting all records. This can be done
   very fast by using a global lock. The entire data portion of the
+  very fast by using an allrecord lock. The entire data portion of the
   file becomes a single entry in the freelist.
   This code carefully steps around the recovery area, leaving it alone
  */
 int tdb_wipe_all(struct tdb_context *tdb)
+_PUBLIC_ int tdb_wipe_all(struct tdb_context *tdb)
+{
         int i;
 …
   repack a tdb
  */
 int tdb_repack(struct tdb_context *tdb)
+_PUBLIC_ int tdb_repack(struct tdb_context *tdb)
+{
         struct tdb_context *tmp_db;
 …
+}
+/*
+ * Write all `count` bytes starting at `buf` to descriptor `fd`, calling
+ * write() again for whatever remains after a short write.  Even on files,
+ * we can get partial writes due to signals.
+ *
+ * Returns true once `count` has been consumed, false as soon as write()
+ * returns a negative value (the error is not inspected and not retried).
+ *
+ * NOTE(review): a write() result of 0 leaves `count` untouched, so the loop
+ * just calls write() again -- confirm this cannot spin on the descriptors
+ * this helper is used with.
+ */
+bool tdb_write_all(int fd, const void *buf, size_t count)
+{
+        while (count) {
+                ssize_t ret;
+                ret = write(fd, buf, count);
+                if (ret < 0)
+                        return false;
+                buf = (const char *)buf + ret;  /* step past the bytes just written */
+                count -= ret;
+        }
+        return true;
+}
 #ifdef TDB_TRACE
 static void tdb_trace_write(struct tdb_context *tdb, const char *str)
+{
         if (write(tdb->tracefd, str, strlen(str)) != strlen(str)) {
+        if (!tdb_write_all(tdb->tracefd, str, strlen(str))) {
                 close(tdb->tracefd);
                 tdb->tracefd = -1;

trunk/server/lib/tdb/common/tdb_private.h

-              r657
+              r745
    Copyright (C) Andrew Tridgell              2005
      ** NOTE! The following LGPL license applies to the tdb
      ** library. This does NOT imply that all of Samba is released
      ** under the LGPL
    This library is free software; you can redistribute it and/or
    modify it under the terms of the GNU Lesser General Public
 …
    License along with this library; if not, see <http://www.gnu.org/licenses/>.
 */
 #ifdef __OS2__
 #define INCL_ERRORS
 …
 #define TDB_DEAD_MAGIC (0xFEE1DEAD)
 #define TDB_RECOVERY_MAGIC (0xf53bc0e7U)
+#define TDB_RECOVERY_INVALID_MAGIC (0x0)
+#define TDB_HASH_RWLOCK_MAGIC (0xbad1a51U)
 #define TDB_ALIGNMENT 4
 #define DEFAULT_HASH_SIZE 131
 …
 /* lock offsets */
 #define GLOBAL_LOCK      0
+#define OPEN_LOCK        0
 #define ACTIVE_LOCK      4
 #define TRANSACTION_LOCK 8
 …
         tdb_off_t recovery_start; /* offset of transaction recovery region */
         tdb_off_t sequence_number; /* used when TDB_SEQNUM is set */
+        tdb_off_t reserved[29];
+        uint32_t magic1_hash; /* hash of TDB_MAGIC_FOOD. */
+        uint32_t magic2_hash; /* hash of TDB_MAGIC. */
+        tdb_off_t reserved[27];
 };
 struct tdb_lock_type {
         int list;
+        uint32_t off;
         uint32_t count;
         uint32_t ltype;
 …
 };
+enum tdb_lock_flags {   /* options for the lock helpers; may be OR-ed together */
+        /* WAIT == F_SETLKW, NOWAIT == F_SETLK */
+        TDB_LOCK_NOWAIT = 0,    /* no bit set: the absence of TDB_LOCK_WAIT */
+        TDB_LOCK_WAIT = 1,
+        /* If set, don't log an error on failure. */
+        TDB_LOCK_PROBE = 2,
+        /* If set, don't actually lock at all. */
+        TDB_LOCK_MARK_ONLY = 4,
+};
 struct tdb_methods {
 …
         int (*tdb_oob)(struct tdb_context *, tdb_off_t , int );
         int (*tdb_expand_file)(struct tdb_context *, tdb_off_t , tdb_off_t );
-        int (*tdb_brlock)(struct tdb_context *, tdb_off_t , int, int, int, size_t);
 };
 …
         int traverse_read; /* read-only traversal */
         int traverse_write; /* read-write traversal */
         struct tdb_lock_type global_lock;
+        struct tdb_lock_type allrecord_lock; /* .off == upgradable */
         int num_lockrecs;
         struct tdb_lock_type *lockrecs; /* only real locks, all with count>0 */
 …
         unsigned int (*hash_fn)(TDB_DATA *key);
         int open_flags; /* flags used in the open - needed by reopen */
-        unsigned int num_locks; /* number of chain locks held */
         const struct tdb_methods *methods;
         struct tdb_transaction *transaction;
         int page_size;
         int max_dead_records;
-        int transaction_lock_count;
 #ifdef TDB_TRACE
         int tracefd;
 …
 int tdb_lock(struct tdb_context *tdb, int list, int ltype);
 int tdb_lock_nonblock(struct tdb_context *tdb, int list, int ltype);
+int tdb_nest_lock(struct tdb_context *tdb, uint32_t offset, int ltype,
+                  enum tdb_lock_flags flags);
+int tdb_nest_unlock(struct tdb_context *tdb, uint32_t offset, int ltype,
+                    bool mark_lock);
 int tdb_unlock(struct tdb_context *tdb, int list, int ltype);
+int tdb_brlock(struct tdb_context *tdb, tdb_off_t offset, int rw_type, int lck_type, int probe, size_t len);
+int tdb_transaction_lock(struct tdb_context *tdb, int ltype);
+int tdb_transaction_unlock(struct tdb_context *tdb);
+int tdb_brlock_upgrade(struct tdb_context *tdb, tdb_off_t offset, size_t len);
+int tdb_brlock(struct tdb_context *tdb,
+               int rw_type, tdb_off_t offset, size_t len,
+               enum tdb_lock_flags flags);
+int tdb_brunlock(struct tdb_context *tdb,
+                 int rw_type, tdb_off_t offset, size_t len);
+bool tdb_have_extra_locks(struct tdb_context *tdb);
+void tdb_release_transaction_locks(struct tdb_context *tdb);
+int tdb_transaction_lock(struct tdb_context *tdb, int ltype,
+                         enum tdb_lock_flags lockflags);
+int tdb_transaction_unlock(struct tdb_context *tdb, int ltype);
+int tdb_recovery_area(struct tdb_context *tdb,
+                      const struct tdb_methods *methods,
+                      tdb_off_t *recovery_offset,
+                      struct tdb_record *rec);
+int tdb_allrecord_lock(struct tdb_context *tdb, int ltype,
+                       enum tdb_lock_flags flags, bool upgradable);
+int tdb_allrecord_unlock(struct tdb_context *tdb, int ltype, bool mark_lock);
+int tdb_allrecord_upgrade(struct tdb_context *tdb);
 int tdb_write_lock_record(struct tdb_context *tdb, tdb_off_t off);
 int tdb_write_unlock_record(struct tdb_context *tdb, tdb_off_t off);
 …
 int tdb_lock_record(struct tdb_context *tdb, tdb_off_t off);
 int tdb_unlock_record(struct tdb_context *tdb, tdb_off_t off);
 int _tdb_transaction_cancel(struct tdb_context *tdb);
+bool tdb_needs_recovery(struct tdb_context *tdb);
 int tdb_rec_read(struct tdb_context *tdb, tdb_off_t offset, struct tdb_record *rec);
 int tdb_rec_write(struct tdb_context *tdb, tdb_off_t offset, struct tdb_record *rec);
 …
 int tdb_rec_free_read(struct tdb_context *tdb, tdb_off_t off,
                       struct tdb_record *rec);
+bool tdb_write_all(int fd, const void *buf, size_t count);
+int tdb_transaction_recover(struct tdb_context *tdb);
+void tdb_header_hash(struct tdb_context *tdb,
+                     uint32_t *magic1_hash, uint32_t *magic2_hash);
+unsigned int tdb_old_hash(TDB_DATA *key);
+size_t tdb_dead_space(struct tdb_context *tdb, tdb_off_t off);

trunk/server/lib/tdb/common/transaction.c

-              r657
+              r745
      ** library. This does NOT imply that all of Samba is released
      ** under the LGPL
    This library is free software; you can redistribute it and/or
    modify it under the terms of the GNU Lesser General Public
 …
     existing transaction record. If the inner transaction is cancelled
     then a subsequent commit will fail
   - keep a mirrored copy of the tdb hash chain heads to allow for the
     fast hash heads scan on traverse, updating the mirrored copy in
 …
   - check for a valid recovery record on open of the tdb, while the
     global lock is held. Automatically recover from the transaction
+    open lock is held. Automatically recover from the transaction
     recovery area if needed, then continue with the open as
     usual. This allows for smooth crash recovery with no administrator
 …
         tdb_off_t magic_offset;
-        /* set when the GLOBAL_LOCK has been taken */
-        bool global_lock_taken;
         /* old file size before transaction */
         tdb_len_t old_map_size;
         /* we should re-pack on commit */
         bool need_repack;
+        /* did we expand in this transaction */
+        bool expanded;
 };
 …
+                }
+        }
         /* now copy it out of this block */
         memcpy(buf, tdb->transaction->blocks[blk] + (off % tdb->transaction->block_size), len);
 …
+                }
+        }
         /* overwrite part of an existing block */
         if (buf == NULL) {
 …
+        }
+        tdb->transaction->need_repack = true;
+        return 0;
+}
+/*
+  brlock during a transaction - ignore them
+*/
+static int transaction_brlock(struct tdb_context *tdb, tdb_off_t offset,
+                              int rw_type, int lck_type, int probe, size_t len)
+{
+        tdb->transaction->expanded = true;
         return 0;
+}
 …
         transaction_oob,
         transaction_expand_file,
-        transaction_brlock
 };
 …
   transaction is allowed to be pending per tdb_context
 */
+int tdb_transaction_start(struct tdb_context *tdb)
+static int _tdb_transaction_start(struct tdb_context *tdb,
+                                  enum tdb_lock_flags lockflags)
+{
         /* some sanity checks */
 …
+        }
         if (tdb->num_locks != 0 || tdb->global_lock.count) {
+        if (tdb_have_extra_locks(tdb)) {
                 /* the caller must not have any locks when starting a
                    transaction as otherwise we'll be screwed by lack
 …
            discussed with Volker, there are a number of ways we could
            make this async, which we will probably do in the future */
         if (tdb_transaction_lock(tdb, F_WRLCK) == -1) {
+        if (tdb_transaction_lock(tdb, F_WRLCK, lockflags) == -1) {
                 SAFE_FREE(tdb->transaction->blocks);
                 SAFE_FREE(tdb->transaction);
+                return -1;
+        }
+                if ((lockflags & TDB_LOCK_WAIT) == 0) {
+                        tdb->ecode = TDB_ERR_NOLOCK;
+                }
+                return -1;
+        }
         /* get a read lock from the freelist to the end of file. This
            is upgraded to a write lock during the commit */
 #ifndef __OS2__ // YD the transaction lock is an exclusive lock for us, it is enough.
         if (tdb_brlock(tdb, FREELIST_TOP, F_RDLCK, F_SETLKW, 0, 0) == -1) {
+        if (tdb_allrecord_lock(tdb, F_RDLCK, TDB_LOCK_WAIT, true) == -1) {
                 TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_transaction_start: failed to get hash locks\n"));
+                tdb->ecode = TDB_ERR_LOCK;
+                goto fail;
+                goto fail_allrecord_lock;
+        }
 #endif
 …
         tdb_trace(tdb, "tdb_transaction_start");
         return 0;
 fail:
 #ifndef __OS2__ // YD the transaction lock is an exclusive lock for us, it is enough.
         tdb_brlock(tdb, FREELIST_TOP, F_UNLCK, F_SETLKW, 0, 0);
+        tdb_allrecord_unlock(tdb, F_RDLCK, false);
 #endif
+        tdb_transaction_unlock(tdb);
+fail_allrecord_lock:
+        tdb_transaction_unlock(tdb, F_WRLCK);
         SAFE_FREE(tdb->transaction->blocks);
         SAFE_FREE(tdb->transaction->hash_heads);
 …
+}
+_PUBLIC_ int tdb_transaction_start(struct tdb_context *tdb)
+{       /* Blocking start: passes TDB_LOCK_WAIT to the shared helper and
+         * returns its result unchanged. */
+        return _tdb_transaction_start(tdb, TDB_LOCK_WAIT);
+}
+_PUBLIC_ int tdb_transaction_start_nonblock(struct tdb_context *tdb)
+{       /* Non-blocking start: same helper, but with TDB_LOCK_NOWAIT (do not
+         * wait for the lock) and TDB_LOCK_PROBE (do not log a lock failure).
+         * Returns the helper's result unchanged. */
+        return _tdb_transaction_start(tdb, TDB_LOCK_NOWAIT|TDB_LOCK_PROBE);
+}
 /*
 …
+        }
+#ifdef HAVE_FDATASYNC
+        if (fdatasync(tdb->fd) != 0) {
+#else
         if (fsync(tdb->fd) != 0) {
+#endif
                 tdb->ecode = TDB_ERR_IO;
                 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction: fsync failed\n"));
 …
 int _tdb_transaction_cancel(struct tdb_context *tdb)
+static int _tdb_transaction_cancel(struct tdb_context *tdb)
+{
         int i, ret = 0;
 …
         if (tdb->transaction->magic_offset) {
                 const struct tdb_methods *methods = tdb->transaction->io_methods;
                 uint32_t zero = 0;
+                const uint32_t invalid = TDB_RECOVERY_INVALID_MAGIC;
                 /* remove the recovery marker */
                 if (methods->tdb_write(tdb, tdb->transaction->magic_offset, &zero, 4) == -1 ||
+                if (methods->tdb_write(tdb, tdb->transaction->magic_offset, &invalid, 4) == -1 ||
                 transaction_sync(tdb, tdb->transaction->magic_offset, 4) == -1) {
                         TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_cancel: failed to remove recovery magic\n"));
 …
+        }
+        if (tdb->transaction->global_lock_taken) {
+                tdb_brlock(tdb, GLOBAL_LOCK, F_UNLCK, F_SETLKW, 0, 1);
+                tdb->transaction->global_lock_taken = false;
+        }
+        /* remove any global lock created during the transaction */
+        if (tdb->global_lock.count != 0) {
+                tdb_brlock(tdb, FREELIST_TOP, F_UNLCK, F_SETLKW, 0, 4*tdb->header.hash_size);
+                tdb->global_lock.count = 0;
+        }
+        /* remove any locks created during the transaction */
+        if (tdb->num_locks != 0) {
+                for (i=0;i<tdb->num_lockrecs;i++) {
+                        tdb_brlock(tdb,FREELIST_TOP+4*tdb->lockrecs[i].list,
+                                   F_UNLCK,F_SETLKW, 0, 1);
+                }
+                tdb->num_locks = 0;
+                tdb->num_lockrecs = 0;
+                SAFE_FREE(tdb->lockrecs);
+        }
+        /* This also removes the OPEN_LOCK, if we have it. */
+        tdb_release_transaction_locks(tdb);
         /* restore the normal io methods */
         tdb->methods = tdb->transaction->io_methods;
-#ifndef __OS2__ // YD the transation lock is an exclusive lock for us, it is enough
-        tdb_brlock(tdb, FREELIST_TOP, F_UNLCK, F_SETLKW, 0, 0);
-#endif
-        tdb_transaction_unlock(tdb);
         SAFE_FREE(tdb->transaction->hash_heads);
         SAFE_FREE(tdb->transaction);
         return ret;
+}
 …
   cancel the current transaction
 */
 int tdb_transaction_cancel(struct tdb_context *tdb)
+_PUBLIC_ int tdb_transaction_cancel(struct tdb_context *tdb)
+{
         tdb_trace(tdb, "tdb_transaction_cancel");
 …
+}
+int tdb_recovery_area(struct tdb_context *tdb,
+                      const struct tdb_methods *methods,
+                      tdb_off_t *recovery_offset,
+                      struct tdb_record *rec)
+{       /*
+         * Look up the recovery area: store the offset read from
+         * TDB_RECOVERY_HEAD in *recovery_offset and, when that offset is
+         * non-zero, read the record header there into *rec via `methods`.
+         * Returns 0 on success and -1 if either read fails.  When there is
+         * no usable area, rec->rec_len is set to 0 (and *recovery_offset
+         * is 0).
+         */
+        if (tdb_ofs_read(tdb, TDB_RECOVERY_HEAD, recovery_offset) == -1) {
+                return -1;
+        }
+        if (*recovery_offset == 0) {    /* no recovery offset stored */
+                rec->rec_len = 0;
+                return 0;
+        }
+        if (methods->tdb_read(tdb, *recovery_offset, rec, sizeof(*rec),
+                              DOCONV()) == -1) {
+                return -1;
+        }
+        /* ignore invalid recovery regions: can happen in crash */
+        if (rec->magic != TDB_RECOVERY_MAGIC &&
+            rec->magic != TDB_RECOVERY_INVALID_MAGIC) {
+                *recovery_offset = 0;   /* report the area as absent, still a success */
+                rec->rec_len = 0;
+        }
+        return 0;
+}
 /*
   allocate the recovery area, or use an existing recovery area if it is
 …
         tdb_off_t recovery_head;
         if (tdb_ofs_read(tdb, TDB_RECOVERY_HEAD, &recovery_head) == -1) {
+        if (tdb_recovery_area(tdb, methods, &recovery_head, &rec) == -1) {
                 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_recovery_allocate: failed to read recovery head\n"));
-                return -1;
+        }
-        rec.rec_len = 0;
-        if (recovery_head != 0 &&
-            methods->tdb_read(tdb, recovery_head, &rec, sizeof(rec), DOCONV()) == -1) {
-                TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_recovery_allocate: failed to read recovery record\n"));
                 return -1;
+        }
 …
         memset(rec, 0, sizeof(*rec));
         rec->magic    = 0;
+        rec->magic    = TDB_RECOVERY_INVALID_MAGIC;
         rec->data_len = recovery_size;
         rec->rec_len  = recovery_max_size;
         rec->key_len  = old_map_size;
         CONVERT(rec);
+        CONVERT(*rec);
         /* build the recovery data into a single blob to allow us to do a single
 …
                         length = tdb->transaction->last_block_size;
+                }
                 if (offset >= old_map_size) {
                         continue;
 …
         tailer = sizeof(*rec) + recovery_max_size;
         memcpy(p, &tailer, 4);
+        CONVERT(p);
+        if (DOCONV()) {
+                tdb_convert(p, 4);
+        }
         /* write the recovery data to the recovery area */
 …
         methods = tdb->transaction->io_methods;
         /* if there are any locks pending then the caller has not
            nested their locks properly, so fail the transaction */
         if (tdb->num_locks || tdb->global_lock.count) {
+        if (tdb_have_extra_locks(tdb)) {
                 tdb->ecode = TDB_ERR_LOCK;
                 TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_transaction_prepare_commit: locks pending on commit\n"));
 …
         /* upgrade the main transaction lock region to a write lock */
 #ifndef __OS2__ // YD the global lock is an exclusive lock for us, it is enough
         if (tdb_brlock_upgrade(tdb, FREELIST_TOP, 0) == -1) {
+        if (tdb_allrecord_upgrade(tdb) == -1) {
                 TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_transaction_prepare_commit: failed to upgrade hash locks\n"));
-                tdb->ecode = TDB_ERR_LOCK;
                 _tdb_transaction_cancel(tdb);
                 return -1;
 …
 #endif
         /* get the global lock - this prevents new users attaching to the database
+        /* get the open lock - this prevents new users attaching to the database
            during the commit */
+        if (tdb_brlock(tdb, GLOBAL_LOCK, F_WRLCK, F_SETLKW, 0, 1) == -1) {
+                TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_transaction_prepare_commit: failed to get global lock\n"));
+                tdb->ecode = TDB_ERR_LOCK;
+        if (tdb_nest_lock(tdb, OPEN_LOCK, F_WRLCK, TDB_LOCK_WAIT) == -1) {
+                TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_transaction_prepare_commit: failed to get open lock\n"));
                 _tdb_transaction_cancel(tdb);
                 return -1;
+        }
-        tdb->transaction->global_lock_taken = true;
         if (!(tdb->flags & TDB_NOSYNC)) {
 …
+        }
         /* Keep the global lock until the actual commit */
+        /* Keep the open lock until the actual commit */
         return 0;
 …
    prepare to commit the current transaction
 */
 int tdb_transaction_prepare_commit(struct tdb_context *tdb)
+{
+_PUBLIC_ int tdb_transaction_prepare_commit(struct tdb_context *tdb)
+{
         tdb_trace(tdb, "tdb_transaction_prepare_commit");
         return _tdb_transaction_prepare_commit(tdb);
+}
+/*
+ * A repack is worthwhile if the largest is less than half total free.
+ *
+ * Follows the chain of records starting from the offset stored at
+ * FREELIST_TOP, summing rec_len of every record read and remembering the
+ * biggest one.  Returns true when the sum exceeds twice the biggest
+ * rec_len, false otherwise -- including when the offset at FREELIST_TOP
+ * cannot be read.
+ *
+ * The walk stops at a zero pointer or at the first record for which
+ * tdb_rec_free_read() returns non-zero; in the latter case the decision is
+ * made from the records seen so far.
+ */
+static bool repack_worthwhile(struct tdb_context *tdb)
+{
+        tdb_off_t ptr;
+        struct tdb_record rec;
+        tdb_len_t total = 0, largest = 0;
+        if (tdb_ofs_read(tdb, FREELIST_TOP, &ptr) == -1) {
+                return false;
+        }
+        while (ptr != 0 && tdb_rec_free_read(tdb, ptr, &rec) == 0) {
+                total += rec.rec_len;
+                if (rec.rec_len > largest) {
+                        largest = rec.rec_len;
+                }
+                ptr = rec.next;         /* move on to the next record in the chain */
+        }
+        return total > largest * 2;
+}
 /*
   commit the current transaction
 */
 int tdb_transaction_commit(struct tdb_context *tdb)
+{
+_PUBLIC_ int tdb_transaction_commit(struct tdb_context *tdb)
+{
         const struct tdb_methods *methods;
         int i;
         bool need_repack;
+        bool need_repack = false;
         if (tdb->transaction == NULL) {
 …
         tdb_trace(tdb, "tdb_transaction_commit");
         if (tdb->transaction->transaction_error) {
                 tdb->ecode = TDB_ERR_IO;
 …
                 return -1;
+        }
         if (tdb->transaction->nesting != 0) {
 …
                         length = tdb->transaction->last_block_size;
+                }
                 if (methods->tdb_write(tdb, offset, tdb->transaction->blocks[i], length) == -1) {
                         TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_commit: write failed during commit\n"));
                         /* we've overwritten part of the data and
                            possibly expanded the file, so we need to
 …
                 SAFE_FREE(tdb->transaction->blocks[i]);
+        }
+        /* Do this before we drop lock or blocks. */
+        if (tdb->transaction->expanded) {
+                need_repack = repack_worthwhile(tdb);
+        }
         SAFE_FREE(tdb->transaction->blocks);
 …
 #endif
-        need_repack = tdb->transaction->need_repack;
         /* use a transaction cancel to free memory and remove the
            transaction locks */
 …
 /*
   recover from an aborted transaction. Must be called with exclusive
   database write access already established (including the global
+  database write access already established (including the open
   lock to prevent new processes attaching)
 */
 …
                 return -1;
+        }
-        /* reduce the file size to the old size */
-        tdb_munmap(tdb);
-        if (ftruncate(tdb->fd, recovery_eof) != 0) {
-                TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_recover: failed to reduce to recovery size\n"));
-                tdb->ecode = TDB_ERR_IO;
-                return -1;
+        }
-        tdb->map_size = recovery_eof;
-        tdb_mmap(tdb);
         if (transaction_sync(tdb, 0, recovery_eof) == -1) {
 …
         return 0;
+}
+/*
+ * Any I/O failures we say "needs recovery".
+ *
+ * Reads the offset stored at TDB_RECOVERY_HEAD and, when it is non-zero,
+ * the record header found at that offset.  Returns true if that record's
+ * magic equals TDB_RECOVERY_MAGIC or if either read fails; returns false
+ * if the stored offset is 0 or the magic is anything else.
+ */
+bool tdb_needs_recovery(struct tdb_context *tdb)
+{
+        tdb_off_t recovery_head;
+        struct tdb_record rec;
+        /* find the recovery area */
+        if (tdb_ofs_read(tdb, TDB_RECOVERY_HEAD, &recovery_head) == -1) {
+                return true;
+        }
+        if (recovery_head == 0) {
+                /* we have never allocated a recovery record */
+                return false;
+        }
+        /* read the recovery record */
+        if (tdb->methods->tdb_read(tdb, recovery_head, &rec,
+                                   sizeof(rec), DOCONV()) == -1) {
+                return true;
+        }
+        return (rec.magic == TDB_RECOVERY_MAGIC);
+}

trunk/server/lib/tdb/common/traverse.c

-              r647
+              r745
    Copyright (C) Paul `Rusty' Russell              2000
    Copyright (C) Jeremy Allison                    2000-2003
      ** NOTE! The following LGPL license applies to the tdb
      ** library. This does NOT imply that all of Samba is released
      ** under the LGPL
    This library is free software; you can redistribute it and/or
    modify it under the terms of the GNU Lesser General Public
 …
                            hashes are used. In that case we spend most of our
                            time in tdb_brlock(), locking empty hash chains.
                            To avoid this, we do an unlocked pre-check to see
                            if the hash chain is empty before starting to look
 …
                            lock, so instead we get the lock and re-fetch the
                            value below.
                            Notice that not doing this optimisation on the
                            first hash chain is critical. We must guarantee
 …
                            could miss them anyway without this trick, so the
                            semantics don't change.
                            With a non-indexed ldb search this trick gains us a
                            factor of around 80 in speed on a linux 2.6.x
 …
   a read style traverse - temporarily marks the db read only
 */
 int tdb_traverse_read(struct tdb_context *tdb,
+_PUBLIC_ int tdb_traverse_read(struct tdb_context *tdb,
                       tdb_traverse_func fn, void *private_data)
+{
         struct tdb_traverse_lock tl = { NULL, 0, 0, F_RDLCK };
         int ret;
 …
         /* we need to get a read lock on the transaction lock here to
            cope with the lock ordering semantics of solaris10 */
         if (tdb_transaction_lock(tdb, F_RDLCK)) {
+        if (tdb_transaction_lock(tdb, F_RDLCK, TDB_LOCK_WAIT)) {
                 return -1;
+        }
 …
         tdb->traverse_read--;
         tdb_transaction_unlock(tdb);
+        tdb_transaction_unlock(tdb, F_RDLCK);
         return ret;
 …
   alignment restrictions malloc gives you.
 */
 int tdb_traverse(struct tdb_context *tdb,
+_PUBLIC_ int tdb_traverse(struct tdb_context *tdb,
                  tdb_traverse_func fn, void *private_data)
+{
 …
+        }
         if (tdb_transaction_lock(tdb, F_WRLCK)) {
+        if (tdb_transaction_lock(tdb, F_WRLCK, TDB_LOCK_WAIT)) {
                 return -1;
+        }
 …
         tdb->traverse_write--;
         tdb_transaction_unlock(tdb);
+        tdb_transaction_unlock(tdb, F_WRLCK);
         return ret;
 …
 /* find the first entry in the database and return its key */
 TDB_DATA tdb_firstkey(struct tdb_context *tdb)
+_PUBLIC_ TDB_DATA tdb_firstkey(struct tdb_context *tdb)
+{
         TDB_DATA key;
 …
 /* find the next entry in the database, returning its key */
 TDB_DATA tdb_nextkey(struct tdb_context *tdb, TDB_DATA oldkey)
+_PUBLIC_ TDB_DATA tdb_nextkey(struct tdb_context *tdb, TDB_DATA oldkey)
+{
         uint32_t oldhash;

trunk/server/lib/tdb/docs/README

-              r414
+              r745
 ----------------------------------------------------------------------
+int tdb_parse_record(struct tdb_context *tdb, TDB_DATA key,
+                     int (*parser)(TDB_DATA key, TDB_DATA data,
+                                   void *private_data),
+                     void *private_data);
+   Hand a record to a parser function without allocating it.
+   This function is meant as a fast tdb_fetch alternative for large records
+   that are frequently read. The "key" and "data" arguments point directly
+   into the tdb shared memory, they are not aligned at any boundary.
+   WARNING: The parser is called while tdb holds a lock on the record. DO NOT
+   call other tdb routines from within the parser. Also, for good performance
+   you should make the parser fast to allow parallel operations.
+   tdb_parse_record returns -1 if the record was not found.  If the record was
+   found, the return value of "parser" is passed up to the caller.
+----------------------------------------------------------------------
 int tdb_exists(TDB_CONTEXT *tdb, TDB_DATA key);

trunk/server/lib/tdb/include/tdb.h

-              r414
+              r745
 #endif
+#include "signal.h"
+/* flags to tdb_store() */
+#define TDB_REPLACE 1           /* Unused */
+#define TDB_INSERT 2            /* Don't overwrite an existing entry */
+#define TDB_MODIFY 3            /* Don't create an existing entry    */
+/* flags for tdb_open() */
+#define TDB_DEFAULT 0 /* just a readability place holder */
+#define TDB_CLEAR_IF_FIRST 1
+#define TDB_INTERNAL 2 /* don't store on disk */
+#define TDB_NOLOCK   4 /* don't do any locking */
+#define TDB_NOMMAP   8 /* don't use mmap */
+#define TDB_CONVERT 16 /* convert endian (internal use) */
+#define TDB_BIGENDIAN 32 /* header is big-endian (internal use) */
+#define TDB_NOSYNC   64 /* don't use synchronous transactions */
+#define TDB_SEQNUM   128 /* maintain a sequence number */
+#define TDB_VOLATILE   256 /* Activate the per-hashchain freelist, default 5 */
+#define TDB_ALLOW_NESTING 512 /* Allow transactions to nest */
+#define TDB_DISALLOW_NESTING 1024 /* Disallow transactions to nest */
+/* error codes */
+#include <signal.h>
+/**
+ * @defgroup tdb The tdb API
+ *
+ * tdb is a Trivial database. In concept, it is very much like GDBM, and BSD's
+ * DB except that it allows multiple simultaneous writers and uses locking
+ * internally to keep writers from trampling on each other. tdb is also
+ * extremely small.
+ *
+ * @section tdb_interface Interface
+ *
+ * The interface is very similar to gdbm except for the following:
+ *
+ * <ul>
+ * <li>different open interface. The tdb_open call is more similar to a
+ * traditional open()</li>
+ * <li>no tdbm_reorganise() function</li>
+ * <li>no tdbm_sync() function. No operations are cached in the library
+ *     anyway</li>
+ * <li>added a tdb_traverse() function for traversing the whole database</li>
+ * <li>added transactions support</li>
+ * </ul>
+ *
+ * A general rule for using tdb is that the caller frees any returned TDB_DATA
+ * structures. Just call free(p.dptr) to free a TDB_DATA return value called p.
+ * This is the same as gdbm.
+ *
+ * @{
+ */
+/** Flags to tdb_store() */
+#define TDB_REPLACE 1           /** Unused */
+#define TDB_INSERT 2            /** Don't overwrite an existing entry */
+#define TDB_MODIFY 3            /** Don't create an existing entry    */
+/** Flags for tdb_open() */
+#define TDB_DEFAULT 0 /** just a readability place holder */
+#define TDB_CLEAR_IF_FIRST 1 /** If this is the first open, wipe the db */
+#define TDB_INTERNAL 2 /** Don't store on disk */
+#define TDB_NOLOCK   4 /** Don't do any locking */
+#define TDB_NOMMAP   8 /** Don't use mmap */
+#define TDB_CONVERT 16 /** Convert endian (internal use) */
+#define TDB_BIGENDIAN 32 /** Header is big-endian (internal use) */
+#define TDB_NOSYNC   64 /** Don't use synchronous transactions */
+#define TDB_SEQNUM   128 /** Maintain a sequence number */
+#define TDB_VOLATILE   256 /** Activate the per-hashchain freelist, default 5 */
+#define TDB_ALLOW_NESTING 512 /** Allow transactions to nest */
+#define TDB_DISALLOW_NESTING 1024 /** Disallow transactions to nest */
+#define TDB_INCOMPATIBLE_HASH 2048 /** Better hashing: can't be opened by tdb < 1.2.6. */
+/** The tdb error codes */
 enum TDB_ERROR {TDB_SUCCESS=0, TDB_ERR_CORRUPT, TDB_ERR_IO, TDB_ERR_LOCK,
                 TDB_ERR_OOM, TDB_ERR_EXISTS, TDB_ERR_NOLOCK, TDB_ERR_LOCK_TIMEOUT,
 …
                 TDB_ERR_NESTING};
 /* debugging uses one of the following levels */
+/** Debugging uses one of the following levels */
 enum tdb_debug_level {TDB_DEBUG_FATAL = 0, TDB_DEBUG_ERROR,
                       TDB_DEBUG_WARNING, TDB_DEBUG_TRACE};
+/** The tdb data structure */
 typedef struct TDB_DATA {
         unsigned char *dptr;
 …
 #endif
 /* this is the context structure that is returned from a db open */
+/** This is the context structure that is returned from a db open. */
 typedef struct tdb_context TDB_CONTEXT;
 …
 };
+/**
+ * @brief Open the database, creating it if necessary.
+ *
+ * @param[in]  name     The name of the db to open.
+ *
+ * @param[in]  hash_size The hash size is advisory, use zero for a default
+ *                       value.
+ *
+ * @param[in]  tdb_flags The flags to use to open the db:\n\n
+ *                         TDB_CLEAR_IF_FIRST - Clear database if we are the
+ *                                              only one with it open\n
+ *                         TDB_INTERNAL - Don't use a file, instead store the
+ *                                        data in memory. The filename is
+ *                                        ignored in this case.\n
+ *                         TDB_NOLOCK - Don't do any locking\n
+ *                         TDB_NOMMAP - Don't use mmap\n
+ *                         TDB_NOSYNC - Don't synchronise transactions to disk\n
+ *                         TDB_SEQNUM - Maintain a sequence number\n
+ *                         TDB_VOLATILE - activate the per-hashchain freelist,
+ *                                        default 5.\n
+ *                         TDB_ALLOW_NESTING - Allow transactions to nest.\n
+ *                         TDB_DISALLOW_NESTING - Disallow transactions to nest.\n
+ *
+ * @param[in]  open_flags Flags for the open(2) function.
+ *
+ * @param[in]  mode     The mode for the open(2) function.
+ *
+ * @return              A tdb context structure, NULL on error.
+ */
 struct tdb_context *tdb_open(const char *name, int hash_size, int tdb_flags,
                       int open_flags, mode_t mode);
+/**
+ * @brief Open the database, creating it if necessary.
+ *
+ * This is like tdb_open(), but allows you to pass an initial logging and
+ * hash function. Be careful when passing a hash function - all users of the
+ * database must use the same hash function or you will get data corruption.
+ *
+ * @param[in]  name     The name of the db to open.
+ *
+ * @param[in]  hash_size The hash size is advisory, use zero for a default
+ *                       value.
+ *
+ * @param[in]  tdb_flags The flags to use to open the db:\n\n
+ *                         TDB_CLEAR_IF_FIRST - Clear database if we are the
+ *                                              only one with it open\n
+ *                         TDB_INTERNAL - Don't use a file, instead store the
+ *                                        data in memory. The filename is
+ *                                        ignored in this case.\n
+ *                         TDB_NOLOCK - Don't do any locking\n
+ *                         TDB_NOMMAP - Don't use mmap\n
+ *                         TDB_NOSYNC - Don't synchronise transactions to disk\n
+ *                         TDB_SEQNUM - Maintain a sequence number\n
+ *                         TDB_VOLATILE - activate the per-hashchain freelist,
+ *                                        default 5.\n
+ *                         TDB_ALLOW_NESTING - Allow transactions to nest.\n
+ *                         TDB_DISALLOW_NESTING - Disallow transactions to nest.\n
+ *
+ * @param[in]  open_flags Flags for the open(2) function.
+ *
+ * @param[in]  mode     The mode for the open(2) function.
+ *
+ * @param[in]  log_ctx  The logging function to use.
+ *
+ * @param[in]  hash_fn  The hash function you want to use.
+ *
+ * @return              A tdb context structure, NULL on error.
+ *
+ * @see tdb_open()
+ */
 struct tdb_context *tdb_open_ex(const char *name, int hash_size, int tdb_flags,
                          int open_flags, mode_t mode,
                          const struct tdb_logging_context *log_ctx,
                          tdb_hash_func hash_fn);
+/**
+ * @brief Set the maximum number of dead records per hash chain.
+ *
+ * @param[in]  tdb      The database handle to set the maximum.
+ *
+ * @param[in]  max_dead The maximum number of dead records per hash chain.
+ */
 void tdb_set_max_dead(struct tdb_context *tdb, int max_dead);
+/**
+ * @brief Reopen a tdb.
+ *
+ * This can be used after a fork to ensure that we have an independent seek
+ * pointer from our parent and to re-establish locks.
+ *
+ * @param[in]  tdb      The database to reopen.
+ *
+ * @return              0 on success, -1 on error.
+ */
 int tdb_reopen(struct tdb_context *tdb);
+/**
+ * @brief Reopen all tdb's
+ *
+ * If the parent is longlived (ie. a parent daemon architecture), we know it
+ * will keep its active lock on a tdb opened with CLEAR_IF_FIRST. Thus for
+ * child processes we don't have to add an active lock. This is essential to
+ * improve performance on systems that keep POSIX locks as a non-scalable data
+ * structure in the kernel.
+ *
+ * @param[in]  parent_longlived Whether the parent is longlived or not.
+ *
+ * @return              0 on success, -1 on error.
+ */
 int tdb_reopen_all(int parent_longlived);
+/**
+ * @brief Set a different tdb logging function.
+ *
+ * @param[in]  tdb      The tdb to set the logging function.
+ *
+ * @param[in]  log_ctx  The logging function to set.
+ */
 void tdb_set_logging_function(struct tdb_context *tdb, const struct tdb_logging_context *log_ctx);
+/**
+ * @brief Get the tdb last error code.
+ *
+ * @param[in]  tdb      The tdb to get the error code from.
+ *
+ * @return              A TDB_ERROR code.
+ *
+ * @see TDB_ERROR
+ */
 enum TDB_ERROR tdb_error(struct tdb_context *tdb);
+/**
+ * @brief Get an error string for the last tdb error
+ *
+ * @param[in]  tdb      The tdb to get the error code from.
+ *
+ * @return              An error string.
+ */
 const char *tdb_errorstr(struct tdb_context *tdb);
+/**
+ * @brief Fetch an entry in the database given a key.
+ *
+ * The caller must free the resulting data.
+ *
+ * @param[in]  tdb      The tdb to fetch the key.
+ *
+ * @param[in]  key      The key to fetch.
+ *
+ * @return              The key entry found in the database, NULL on error with
+ *                      TDB_ERROR set.
+ *
+ * @see tdb_error()
+ * @see tdb_errorstr()
+ */
 TDB_DATA tdb_fetch(struct tdb_context *tdb, TDB_DATA key);
+/**
+ * @brief Hand a record to a parser function without allocating it.
+ *
+ * This function is meant as a fast tdb_fetch alternative for large records
+ * that are frequently read. The "key" and "data" arguments point directly
+ * into the tdb shared memory, they are not aligned at any boundary.
+ *
+ * @warning The parser is called while tdb holds a lock on the record. DO NOT
+ * call other tdb routines from within the parser. Also, for good performance
+ * you should make the parser fast to allow parallel operations.
+ *
+ * @param[in]  tdb      The tdb to parse the record.
+ *
+ * @param[in]  key      The key to parse.
+ *
+ * @param[in]  parser   The parser to use to parse the data.
+ *
+ * @param[in]  private_data A private data pointer which is passed to the parser
+ *                          function.
+ *
+ * @return              -1 if the record was not found. If the record was found,
+ *                      the return value of "parser" is passed up to the caller.
+ */
 int tdb_parse_record(struct tdb_context *tdb, TDB_DATA key,
+                     int (*parser)(TDB_DATA key, TDB_DATA data,
+                                   void *private_data),
+                     void *private_data);
+                              int (*parser)(TDB_DATA key, TDB_DATA data,
+                                            void *private_data),
+                              void *private_data);
+/**
+ * @brief Delete an entry in the database given a key.
+ *
+ * @param[in]  tdb      The tdb to delete the key.
+ *
+ * @param[in]  key      The key to delete.
+ *
+ * @return              0 on success, -1 if the key doesn't exist.
+ */
 int tdb_delete(struct tdb_context *tdb, TDB_DATA key);
+/**
+ * @brief Store an element in the database.
+ *
+ * This replaces any existing element with the same key.
+ *
+ * @param[in]  tdb      The tdb to store the entry.
+ *
+ * @param[in]  key      The key to use to store the entry.
+ *
+ * @param[in]  dbuf     The data to store under the key.
+ *
+ * @param[in]  flag     The flags to store the key:\n\n
+ *                      TDB_INSERT: Don't overwrite an existing entry.\n
+ *                      TDB_MODIFY: Don't create a new entry\n
+ *
+ * @return              0 on success, -1 on error with error code set.
+ *
+ * @see tdb_error()
+ * @see tdb_errorstr()
+ */
 int tdb_store(struct tdb_context *tdb, TDB_DATA key, TDB_DATA dbuf, int flag);
+/**
+ * @brief Append data to an entry.
+ *
+ * If the entry doesn't exist, it will create a new one.
+ *
+ * @param[in]  tdb      The database to use.
+ *
+ * @param[in]  key      The key to append the data.
+ *
+ * @param[in]  new_dbuf The data to append to the key.
+ *
+ * @return              0 on success, -1 on error with error code set.
+ *
+ * @see tdb_error()
+ * @see tdb_errorstr()
+ */
 int tdb_append(struct tdb_context *tdb, TDB_DATA key, TDB_DATA new_dbuf);
+/**
+ * @brief Close a database.
+ *
+ * @param[in]  tdb      The database to close.
+ *
+ * @return              0 for success, -1 on error.
+ */
 int tdb_close(struct tdb_context *tdb);
+/**
+ * @brief Find the first entry in the database and return its key.
+ *
+ * The caller must free the returned data.
+ *
+ * @param[in]  tdb      The database to use.
+ *
+ * @return              The first entry of the database, an empty TDB_DATA entry
+ *                      if the database is empty.
+ */
 TDB_DATA tdb_firstkey(struct tdb_context *tdb);
+/**
+ * @brief Find the next entry in the database, returning its key.
+ *
+ * The caller must free the returned data.
+ *
+ * @param[in]  tdb      The database to use.
+ *
+ * @param[in]  key      The key from which you want the next key.
+ *
+ * @return              The next entry of the current key, an empty TDB_DATA
+ *                      entry if there is no entry.
+ */
 TDB_DATA tdb_nextkey(struct tdb_context *tdb, TDB_DATA key);
+int tdb_traverse(struct tdb_context *tdb, tdb_traverse_func fn, void *);
+int tdb_traverse_read(struct tdb_context *tdb, tdb_traverse_func fn, void *);
+/**
+ * @brief Traverse the entire database.
+ *
+ * While traversing, the function fn(tdb, key, data, state) is called on each
+ * element. If fn is NULL then it is not called. A non-zero return value from
+ * fn() indicates that the traversal should stop. Traversal callbacks may not
+ * start transactions.
+ *
+ * @warning The data buffer given to the callback fn does NOT meet the alignment
+ * restrictions malloc gives you.
+ *
+ * @param[in]  tdb      The database to traverse.
+ *
+ * @param[in]  fn       The function to call on each entry.
+ *
+ * @param[in]  private_data The private data which should be passed to the
+ *                          traversing function.
+ *
+ * @return              The record count traversed, -1 on error.
+ */
+int tdb_traverse(struct tdb_context *tdb, tdb_traverse_func fn, void *private_data);
+/**
+ * @brief Traverse the entire database.
+ *
+ * While traversing the database the function fn(tdb, key, data, state) is
+ * called on each element, but marking the database read only during the
+ * traversal, so any write operations will fail. This allows tdb to use read
+ * locks, which increases the parallelism possible during the traversal.
+ *
+ * @param[in]  tdb      The database to traverse.
+ *
+ * @param[in]  fn       The function to call on each entry.
+ *
+ * @param[in]  private_data The private data which should be passed to the
+ *                          traversing function.
+ *
+ * @return              The record count traversed, -1 on error.
+ */
+int tdb_traverse_read(struct tdb_context *tdb, tdb_traverse_func fn, void *private_data);
+/**
+ * @brief Check if an entry in the database exists.
+ *
+ * @note 1 is returned if the key is found and 0 is returned if not found. This
+ * doesn't match the conventions in the rest of this module, but is compatible
+ * with gdbm.
+ *
+ * @param[in]  tdb      The database to check if the entry exists.
+ *
+ * @param[in]  key      The key to check if the entry exists.
+ *
+ * @return              1 if the key is found, 0 if not.
+ */
 int tdb_exists(struct tdb_context *tdb, TDB_DATA key);
+/**
+ * @brief Lock entire database with a write lock.
+ *
+ * @param[in]  tdb      The database to lock.
+ *
+ * @return              0 on success, -1 on error with error code set.
+ *
+ * @see tdb_error()
+ * @see tdb_errorstr()
+ */
 int tdb_lockall(struct tdb_context *tdb);
+/**
+ * @brief Lock entire database with a write lock.
+ *
+ * This is the non-blocking call.
+ *
+ * @param[in]  tdb      The database to lock.
+ *
+ * @return              0 on success, -1 on error with error code set.
+ *
+ * @see tdb_lockall()
+ * @see tdb_error()
+ * @see tdb_errorstr()
+ */
 int tdb_lockall_nonblock(struct tdb_context *tdb);
+/**
+ * @brief Unlock entire database with write lock.
+ *
+ * @param[in]  tdb      The database to unlock.
+ *
+ * @return              0 on success, -1 on error with error code set.
+ *
+ * @see tdb_lockall()
+ * @see tdb_error()
+ * @see tdb_errorstr()
+ */
 int tdb_unlockall(struct tdb_context *tdb);
+/**
+ * @brief Lock entire database with a read lock.
+ *
+ * @param[in]  tdb      The database to lock.
+ *
+ * @return              0 on success, -1 on error with error code set.
+ *
+ * @see tdb_error()
+ * @see tdb_errorstr()
+ */
 int tdb_lockall_read(struct tdb_context *tdb);
+/**
+ * @brief Lock entire database with a read lock.
+ *
+ * This is the non-blocking call.
+ *
+ * @param[in]  tdb      The database to lock.
+ *
+ * @return              0 on success, -1 on error with error code set.
+ *
+ * @see tdb_lockall_read()
+ * @see tdb_error()
+ * @see tdb_errorstr()
+ */
 int tdb_lockall_read_nonblock(struct tdb_context *tdb);
+/**
+ * @brief Unlock entire database with read lock.
+ *
+ * @param[in]  tdb      The database to unlock.
+ *
+ * @return              0 on success, -1 on error with error code set.
+ *
+ * @see tdb_lockall_read()
+ * @see tdb_error()
+ * @see tdb_errorstr()
+ */
 int tdb_unlockall_read(struct tdb_context *tdb);
+/**
+ * @brief Lock entire database with write lock - mark only.
+ *
+ * @todo Add more details.
+ *
+ * @param[in]  tdb      The database to mark.
+ *
+ * @return              0 on success, -1 on error with error code set.
+ *
+ * @see tdb_error()
+ * @see tdb_errorstr()
+ */
 int tdb_lockall_mark(struct tdb_context *tdb);
+/**
+ * @brief Lock entire database with write lock - unmark only.
+ *
+ * @todo Add more details.
+ *
+ * @param[in]  tdb      The database to unmark.
+ *
+ * @return              0 on success, -1 on error with error code set.
+ *
+ * @see tdb_error()
+ * @see tdb_errorstr()
+ */
 int tdb_lockall_unmark(struct tdb_context *tdb);
+/**
+ * @brief Get the name of the current tdb file.
+ *
+ * This is useful for external logging functions.
+ *
+ * @param[in]  tdb      The database to get the name from.
+ *
+ * @return              The name of the database.
+ */
 const char *tdb_name(struct tdb_context *tdb);
+/**
+ * @brief Get the underlying file descriptor being used by tdb.
+ *
+ * This is useful for external routines that want to check the device/inode
+ * of the fd.
+ *
+ * @param[in]  tdb      The database to get the fd from.
+ *
+ * @return              The file descriptor or -1.
+ */
 int tdb_fd(struct tdb_context *tdb);
+/**
+ * @brief Get the current logging function.
+ *
+ * This is useful for external tdb routines that wish to log tdb errors.
+ *
+ * @param[in]  tdb      The database to get the logging function from.
+ *
+ * @return              The logging function of the database.
+ *
+ * @see tdb_get_logging_private()
+ */
 tdb_log_func tdb_log_fn(struct tdb_context *tdb);
+/**
+ * @brief Get the private data of the logging function.
+ *
+ * @param[in]  tdb      The database to get the data from.
+ *
+ * @return              The private data pointer of the logging function.
+ *
+ * @see tdb_log_fn()
+ */
 void *tdb_get_logging_private(struct tdb_context *tdb);
+/**
+ * @brief Start a transaction.
+ *
+ * All operations after the transaction start can either be committed with
+ * tdb_transaction_commit() or cancelled with tdb_transaction_cancel().
+ *
+ * If you call tdb_transaction_start() again on the same tdb context while a
+ * transaction is in progress, then the same transaction buffer is re-used. The
+ * number of tdb_transaction_{commit,cancel} operations must match the number
+ * of successful tdb_transaction_start() calls.
+ *
+ * Note that transactions are by default disk synchronous, and use a recover
+ * area in the database to automatically recover the database on the next open
+ * if the system crashes during a transaction. You can disable the synchronous
+ * transaction recovery setup using the TDB_NOSYNC flag, which will greatly
+ * speed up operations at the risk of corrupting your database if the system
+ * crashes.
+ *
+ * Operations made within a transaction are not visible to other users of the
+ * database until a successful commit.
+ *
+ * @param[in]  tdb      The database to start the transaction.
+ *
+ * @return              0 on success, -1 on error with error code set.
+ *
+ * @see tdb_error()
+ * @see tdb_errorstr()
+ */
 int tdb_transaction_start(struct tdb_context *tdb);
+/**
+ * @brief Start a transaction, non-blocking.
+ *
+ * @param[in]  tdb      The database to start the transaction.
+ *
+ * @return              0 on success, -1 on error with error code set.
+ *
+ * @see tdb_error()
+ * @see tdb_errorstr()
+ * @see tdb_transaction_start()
+ */
+int tdb_transaction_start_nonblock(struct tdb_context *tdb);
+/**
+ * @brief Prepare to commit a current transaction, for two-phase commits.
+ *
+ * Once prepared for commit, the only allowed calls are tdb_transaction_commit()
+ * or tdb_transaction_cancel(). Preparing allocates disk space for the pending
+ * updates, so a subsequent commit should succeed (barring any hardware
+ * failures).
+ *
+ * @param[in]  tdb      The database to prepare the commit.
+ *
+ * @return              0 on success, -1 on error with error code set.
+ *
+ * @see tdb_error()
+ * @see tdb_errorstr()
+ */
 int tdb_transaction_prepare_commit(struct tdb_context *tdb);
+/**
+ * @brief Commit a current transaction.
+ *
+ * This updates the database and releases the current transaction locks.
+ *
+ * @param[in]  tdb      The database to commit the transaction.
+ *
+ * @return              0 on success, -1 on error with error code set.
+ *
+ * @see tdb_error()
+ * @see tdb_errorstr()
+ */
 int tdb_transaction_commit(struct tdb_context *tdb);
+/**
+ * @brief Cancel a current transaction.
+ *
+ * This discards all write and lock operations that have been made since the
+ * transaction started.
+ *
+ * @param[in]  tdb      The tdb to cancel the transaction on.
+ *
+ * @return              0 on success, -1 on error with error code set.
+ *
+ * @see tdb_error()
+ * @see tdb_errorstr()
+ */
 int tdb_transaction_cancel(struct tdb_context *tdb);
+int tdb_transaction_recover(struct tdb_context *tdb);
+/**
+ * @brief Get the tdb sequence number.
+ *
+ * Only makes sense if the writers opened with TDB_SEQNUM set. Note that this
+ * sequence number will wrap quite quickly, so it should only be used for a
+ * 'has something changed' test, not for code that relies on the count of the
+ * number of changes made. If you want a counter then use a tdb record.
+ *
+ * The aim of this sequence number is to allow for a very lightweight test of a
+ * possible tdb change.
+ *
+ * @param[in]  tdb      The database to get the sequence number from.
+ *
+ * @return              The sequence number or 0.
+ *
+ * @see tdb_open()
+ * @see tdb_enable_seqnum()
+ */
 int tdb_get_seqnum(struct tdb_context *tdb);
+/**
+ * @brief Get the hash size.
+ *
+ * @param[in]  tdb      The database to get the hash size from.
+ *
+ * @return              The hash size.
+ */
 int tdb_hash_size(struct tdb_context *tdb);
+/**
+ * @brief Get the map size.
+ *
+ * @param[in]  tdb     The database to get the map size from.
+ *
+ * @return             The map size.
+ */
 size_t tdb_map_size(struct tdb_context *tdb);
+/**
+ * @brief Get the tdb flags set during open.
+ *
+ * @param[in]  tdb      The database to get the flags from.
+ *
+ * @return              The flags set on the database.
+ */
 int tdb_get_flags(struct tdb_context *tdb);
+/**
+ * @brief Add flags to the database.
+ *
+ * @param[in]  tdb      The database to add the flags.
+ *
+ * @param[in]  flag     The tdb flags to add.
+ */
 void tdb_add_flags(struct tdb_context *tdb, unsigned flag);
+/**
+ * @brief Remove flags from the database.
+ *
+ * @param[in]  tdb      The database to remove the flags.
+ *
+ * @param[in]  flag     The tdb flags to remove.
+ */
 void tdb_remove_flags(struct tdb_context *tdb, unsigned flag);
+/**
+ * @brief Enable sequence number handling on an open tdb.
+ *
+ * @param[in]  tdb      The database to enable sequence number handling.
+ *
+ * @see tdb_get_seqnum()
+ */
 void tdb_enable_seqnum(struct tdb_context *tdb);
+/**
+ * @brief Increment the tdb sequence number.
+ *
+ * This only works if the tdb has been opened using the TDB_SEQNUM flag or
+ * enabled using tdb_enable_seqnum().
+ *
+ * @param[in]  tdb      The database to increment the sequence number.
+ *
+ * @see tdb_enable_seqnum()
+ * @see tdb_get_seqnum()
+ */
 void tdb_increment_seqnum_nonblock(struct tdb_context *tdb);
+/**
+ * @brief Create a hash of the key.
+ *
+ * @param[in]  key      The key to hash
+ *
+ * @return              The hash.
+ */
+unsigned int tdb_jenkins_hash(TDB_DATA *key);
+/**
+ * @brief Check the consistency of the database.
+ *
+ * This checks the consistency of the database, calling back the check function
+ * (if non-NULL) on each record.  If some consistency check fails, or the
+ * supplied check function returns -1, tdb_check returns -1, otherwise 0.
+ *
+ * @note The logging function (if set) will be called with additional
+ * information on the corruption found.
+ *
+ * @param[in]  tdb      The database to check.
+ *
+ * @param[in]  check    The check function to use.
+ *
+ * @param[in]  private_data the private data to pass to the check function.
+ *
+ * @return              0 on success, -1 on error with error code set.
+ *
+ * @see tdb_error()
+ * @see tdb_errorstr()
+ */
 int tdb_check(struct tdb_context *tdb,
               int (*check) (TDB_DATA key, TDB_DATA data, void *private_data),
               void *private_data);
+/* @} ******************************************************************/
 /* Low level locking functions: use with care */
 …
 int tdb_validate_freelist(struct tdb_context *tdb, int *pnum_entries);
 int tdb_freelist_size(struct tdb_context *tdb);
+char *tdb_summary(struct tdb_context *tdb);
 extern TDB_DATA tdb_null;

trunk/server/lib/tdb/libtdb.m4

-              r414
+              r745
 fi
 TDB_OBJ="common/tdb.o common/dump.o common/transaction.o common/error.o common/traverse.o"
 TDB_OBJ="$TDB_OBJ common/freelist.o common/freelistcheck.o common/io.o common/lock.o common/open.o common/check.o"
+TDB_OBJ="$TDB_OBJ common/freelist.o common/freelistcheck.o common/io.o common/lock.o common/open.o common/check.o common/hash.o common/summary.o"
 AC_SUBST(TDB_OBJ)
 AC_SUBST(LIBREPLACEOBJ)
 …
 TDB_LIBS=""
 AC_SUBST(TDB_LIBS)
+TDB_DEPS=""
+if test x$libreplace_cv_HAVE_FDATASYNC_IN_LIBRT = xyes ; then
+        TDB_DEPS="$TDB_DEPS -lrt"
+fi
+AC_SUBST(TDB_DEPS)
 TDB_CFLAGS="-I$tdbdir/include"

trunk/server/lib/tdb/manpages/tdbbackup.8.xml

-              r414
+              r745
 <?xml version="1.0" encoding="iso-8859-1"?>
 <!DOCTYPE refentry PUBLIC "-//Samba-Team//DTD DocBook V4.2-Based Variant V1.0//EN" "http://www.samba.org/samba/DTD/samba-doc">
+<!DOCTYPE refentry PUBLIC "-//OASIS//DTD DocBook XML V4.2//EN" "http://www.oasis-open.org/docbook/xml/4.2/docbookx.dtd">
 <refentry id="tdbbackup.8">
 …
         <refmiscinfo class="source">Samba</refmiscinfo>
         <refmiscinfo class="manual">System Administration tools</refmiscinfo>
         <refmiscinfo class="version">3.5</refmiscinfo>
+        <refmiscinfo class="version">3.6</refmiscinfo>
 </refmeta>

trunk/server/lib/tdb/manpages/tdbdump.8.xml

-              r414
+              r745
 <?xml version="1.0" encoding="iso-8859-1"?>
 <!DOCTYPE refentry PUBLIC "-//Samba-Team//DTD DocBook V4.2-Based Variant V1.0//EN" "http://www.samba.org/samba/DTD/samba-doc">
+<!DOCTYPE refentry PUBLIC "-//OASIS//DTD DocBook XML V4.2//EN" "http://www.oasis-open.org/docbook/xml/4.2/docbookx.dtd">
 <refentry id="tdbdump.8">
 …
         <refmiscinfo class="source">Samba</refmiscinfo>
         <refmiscinfo class="manual">System Administration tools</refmiscinfo>
         <refmiscinfo class="version">3.5</refmiscinfo>
+        <refmiscinfo class="version">3.6</refmiscinfo>
 </refmeta>

trunk/server/lib/tdb/manpages/tdbtool.8.xml

-              r414
+              r745
 <?xml version="1.0" encoding="iso-8859-1"?>
 <!DOCTYPE refentry PUBLIC "-//Samba-Team//DTD DocBook V4.2-Based Variant V1.0//EN" "http://www.samba.org/samba/DTD/samba-doc">
+<!DOCTYPE refentry PUBLIC "-//OASIS//DTD DocBook XML V4.2//EN" "http://www.oasis-open.org/docbook/xml/4.2/docbookx.dtd">
 <refentry id="tdbtool.8">
 …
         <refmiscinfo class="source">Samba</refmiscinfo>
         <refmiscinfo class="manual">System Administration tools</refmiscinfo>
         <refmiscinfo class="version">3.5</refmiscinfo>
+        <refmiscinfo class="version">3.6</refmiscinfo>
 </refmeta>

trunk/server/lib/tdb/pytdb.c

-              r414
+              r745
      ** library. This does NOT imply that all of Samba is released
      ** under the LGPL
    This library is free software; you can redistribute it and/or
    modify it under the terms of the GNU Lesser General Public
 …
 */
+#include <Python.h>
 #include "replace.h"
 #include "system/filesys.h"
-#include <Python.h>
 #ifndef Py_RETURN_NONE
 #define Py_RETURN_NONE return Py_INCREF(Py_None), Py_None
 …
 } PyTdbObject;
 PyAPI_DATA(PyTypeObject) PyTdb;
+staticforward PyTypeObject PyTdb;
 static void PyErr_SetTDBError(TDB_CONTEXT *tdb)
 …
 static PyObject *py_tdb_open(PyTypeObject *type, PyObject *args, PyObject *kwargs)
+{
         char *name;
+        char *name = NULL;
         int hash_size = 0, tdb_flags = TDB_DEFAULT, flags = O_RDWR, mode = 0600;
         TDB_CONTEXT *ctx;
 …
         const char *kwnames[] = { "name", "hash_size", "tdb_flags", "flags", "mode", NULL };
+        if (!PyArg_ParseTupleAndKeywords(args, kwargs, "s|iiii", (char **)kwnames, &name, &hash_size, &tdb_flags, &flags, &mode))
+                return NULL;
+        if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|siiii", (char **)kwnames, &name, &hash_size, &tdb_flags, &flags, &mode))
+                return NULL;
+        if (name == NULL) {
+                tdb_flags |= TDB_INTERNAL;
+        }
         ctx = tdb_open(name, hash_size, tdb_flags, flags, mode);
 …
         ret = PyObject_New(PyTdbObject, &PyTdb);
+        if (!ret) {
+                tdb_close(ctx);
+                return NULL;
+        }
         ret->ctx = ctx;
         ret->closed = false;
 …
+}
 static PyObject *obj_transaction_recover(PyTdbObject *self)
+{
         int ret = tdb_transaction_recover(self->ctx);
+static PyObject *obj_transaction_prepare_commit(PyTdbObject *self)
+{
+        int ret = tdb_transaction_prepare_commit(self->ctx);
         PyErr_TDB_ERROR_IS_ERR_RAISE(ret, self->ctx);
         Py_RETURN_NONE;
 …
+}
+static PyObject *obj_add_flags(PyTdbObject *self, PyObject *args)
+{       /* Method body for add_flags(flags): parses one unsigned integer from
+         * args, passes it to tdb_add_flags() on self->ctx, and returns None. */
+        unsigned flags;
+        if (!PyArg_ParseTuple(args, "I", &flags)) /* "I" = exactly one unsigned int argument */
+                return NULL; /* parsing failed: return NULL so the caller sees the error */
+        tdb_add_flags(self->ctx, flags);
+        Py_RETURN_NONE;
+}
+static PyObject *obj_remove_flags(PyTdbObject *self, PyObject *args)
+{       /* Method body for remove_flags(flags): parses one unsigned integer from
+         * args, passes it to tdb_remove_flags() on self->ctx, and returns None. */
+        unsigned flags;
+        if (!PyArg_ParseTuple(args, "I", &flags)) /* "I" = exactly one unsigned int argument */
+                return NULL; /* parsing failed: return NULL so the caller sees the error */
+        tdb_remove_flags(self->ctx, flags);
+        Py_RETURN_NONE;
+}
 typedef struct {
 …
         ret = PyObject_New(PyTdbIteratorObject, &PyTdbIterator);
+        if (!ret)
+                return NULL;
         ret->current = tdb_firstkey(self->ctx);
         ret->iteratee = self;
 …
         int ret = tdb_wipe_all(self->ctx);
         PyErr_TDB_ERROR_IS_ERR_RAISE(ret, self->ctx);
+        Py_RETURN_NONE;
+}
+static PyObject *obj_repack(PyTdbObject *self)
+{       /* Method body for repack(): calls tdb_repack() on self->ctx and returns
+         * None if execution continues past the error-check macro. */
+        int ret = tdb_repack(self->ctx);
+        PyErr_TDB_ERROR_IS_ERR_RAISE(ret, self->ctx); /* NOTE(review): macro is defined outside this hunk; presumably raises and returns when ret signals failure -- confirm */
+        Py_RETURN_NONE;
+}
+static PyObject *obj_enable_seqnum(PyTdbObject *self)
+{       /* Method body for enable_seqnum(): calls tdb_enable_seqnum() on
+         * self->ctx and returns None; no result of that call is inspected. */
+        tdb_enable_seqnum(self->ctx);
+        Py_RETURN_NONE;
+}
+static PyObject *obj_increment_seqnum_nonblock(PyTdbObject *self)
+{       /* Method body for increment_seqnum_nonblock(): calls
+         * tdb_increment_seqnum_nonblock() on self->ctx and returns None;
+         * no result of that call is inspected. */
+        tdb_increment_seqnum_nonblock(self->ctx);
         Py_RETURN_NONE;
+}
 …
                 "S.transaction_commit() -> None\n"
                 "Commit the currently active transaction." },
         { "transaction_recover", (PyCFunction)obj_transaction_recover, METH_NOARGS,
                 "S.transaction_recover() -> None\n"
                 "Recover the currently active transaction." },
+        { "transaction_prepare_commit", (PyCFunction)obj_transaction_prepare_commit, METH_NOARGS,
+                "S.transaction_prepare_commit() -> None\n"
+                "Prepare to commit the currently active transaction" },
         { "transaction_start", (PyCFunction)obj_transaction_start, METH_NOARGS,
                 "S.transaction_start() -> None\n"
 …
         { "store", (PyCFunction)obj_store, METH_VARARGS, "S.store(key, data, flag=REPLACE) -> None"
                 "Store data." },
+        { "add_flags", (PyCFunction)obj_add_flags, METH_VARARGS, "S.add_flags(flags) -> None" },
+        { "remove_flags", (PyCFunction)obj_remove_flags, METH_VARARGS, "S.remove_flags(flags) -> None" },
         { "iterkeys", (PyCFunction)tdb_object_iter, METH_NOARGS, "S.iterkeys() -> iterator" },
         { "clear", (PyCFunction)obj_clear, METH_NOARGS, "S.clear() -> None\n"
                 "Wipe the entire database." },
+        { "repack", (PyCFunction)obj_repack, METH_NOARGS, "S.repack() -> None\n"
+                "Repack the entire database." },
+        { "enable_seqnum", (PyCFunction)obj_enable_seqnum, METH_NOARGS,
+                "S.enable_seqnum() -> None" },
+        { "increment_seqnum_nonblock", (PyCFunction)obj_increment_seqnum_nonblock, METH_NOARGS,
+                "S.increment_seqnum_nonblock() -> None" },
         { NULL }
 };
 …
+}
+static PyObject *obj_get_freelist_size(PyTdbObject *self, void *closure)
+{       /* Attribute getter: returns tdb_freelist_size(self->ctx) wrapped in a
+         * Python int. The closure argument is not used. */
+        return PyInt_FromLong(tdb_freelist_size(self->ctx));
+}
 static PyObject *obj_get_flags(PyTdbObject *self, void *closure)
+{
 …
         return PyString_FromString(tdb_name(self->ctx));
+}
+static PyObject *obj_get_seqnum(PyTdbObject *self, void *closure)
+{       /* Attribute getter: returns tdb_get_seqnum(self->ctx) wrapped in a
+         * Python int. The closure argument is not used. */
+        return PyInt_FromLong(tdb_get_seqnum(self->ctx));
+}
 static PyGetSetDef tdb_object_getsetters[] = {
         { (char *)"hash_size", (getter)obj_get_hash_size, NULL, NULL },
         { (char *)"map_size", (getter)obj_get_map_size, NULL, NULL },
+        { (char *)"freelist_size", (getter)obj_get_freelist_size, NULL, NULL },
         { (char *)"flags", (getter)obj_get_flags, NULL, NULL },
         { (char *)"max_dead", NULL, (setter)obj_set_max_dead, NULL },
         { (char *)"filename", (getter)obj_get_filename, NULL, (char *)"The filename of this TDB file."},
+        { (char *)"seqnum", (getter)obj_get_seqnum, NULL, NULL },
         { NULL }
 };
 …
 static PyObject *tdb_object_repr(PyTdbObject *self)
+{
+        return PyString_FromFormat("Tdb('%s')", tdb_name(self->ctx));
+        if (tdb_get_flags(self->ctx) & TDB_INTERNAL) {
+                return PyString_FromString("Tdb(<internal>)");
+        } else {
+                return PyString_FromFormat("Tdb('%s')", tdb_name(self->ctx));
+        }
+}
 …
         if (!self->closed)
                 tdb_close(self->ctx);
         PyObject_Del(self);
+        self->ob_type->tp_free(self);
+}
 …
         .mp_ass_subscript = (objobjargproc)obj_setitem,
 };
 PyTypeObject PyTdb = {
+static PyTypeObject PyTdb = {
         .tp_name = "Tdb",
         .tp_basicsize = sizeof(PyTdbObject),
 …
 };
+void inittdb(void);
 void inittdb(void)
+{
 …
         PyModule_AddObject(m, "CONVERT", PyInt_FromLong(TDB_CONVERT));
         PyModule_AddObject(m, "BIGENDIAN", PyInt_FromLong(TDB_BIGENDIAN));
+        PyModule_AddObject(m, "NOSYNC", PyInt_FromLong(TDB_NOSYNC));
+        PyModule_AddObject(m, "SEQNUM", PyInt_FromLong(TDB_SEQNUM));
+        PyModule_AddObject(m, "VOLATILE", PyInt_FromLong(TDB_VOLATILE));
+        PyModule_AddObject(m, "ALLOW_NESTING", PyInt_FromLong(TDB_ALLOW_NESTING));
+        PyModule_AddObject(m, "DISALLOW_NESTING", PyInt_FromLong(TDB_DISALLOW_NESTING));
+        PyModule_AddObject(m, "INCOMPATIBLE_HASH", PyInt_FromLong(TDB_INCOMPATIBLE_HASH));
         PyModule_AddObject(m, "__docformat__", PyString_FromString("restructuredText"));
+        PyModule_AddObject(m, "__version__", PyString_FromString(PACKAGE_VERSION));
         Py_INCREF(&PyTdb);

trunk/server/lib/tdb/python/tdbdump.py

r414	r745
1		#!/usr/bin/python
	1	#!/usr/bin/env python
2	2	# Trivial reimplementation of tdbdump in Python
3	3

trunk/server/lib/tdb/python/tests/simple.py

-              r414
+              r745
 #!/usr/bin/python
+#!/usr/bin/env python
 # Some simple tests for the Python bindings for TDB
 # Note that this tests the interface of the Python bindings
 …
 class OpenTdbTests(TestCase):
     def test_nonexistant_read(self):
+        self.assertRaises(IOError, tdb.Tdb, "/some/nonexistant/file", 0, tdb.DEFAULT, os.O_RDWR)
+        self.assertRaises(IOError, tdb.Tdb, "/some/nonexistant/file", 0,
+                tdb.DEFAULT, os.O_RDWR)
 class CloseTdbTests(TestCase):
     def test_double_close(self):
+        self.tdb = tdb.Tdb(tempfile.mkstemp()[1], 0, tdb.DEFAULT, os.O_CREAT|os.O_RDWR)
+        self.tdb = tdb.Tdb(tempfile.mkstemp()[1], 0, tdb.DEFAULT,
+                os.O_CREAT|os.O_RDWR)
         self.assertNotEqual(None, self.tdb)
 …
+class InternalTdbTests(TestCase):  # Tests for a Tdb created with no arguments (no file name).
+    def test_repr(self):  # repr() of such a database must be exactly "Tdb(<internal>)".
+        self.tdb = tdb.Tdb()
+        # repr used to crash on internal db
+        self.assertEquals(repr(self.tdb), "Tdb(<internal>)")
 class SimpleTdbTests(TestCase):
     def setUp(self):
         super(SimpleTdbTests, self).setUp()
+        self.tdb = tdb.Tdb(tempfile.mkstemp()[1], 0, tdb.DEFAULT, os.O_CREAT|os.O_RDWR)
+        self.tdb = tdb.Tdb(tempfile.mkstemp()[1], 0, tdb.DEFAULT,
+                os.O_CREAT|os.O_RDWR)
         self.assertNotEqual(None, self.tdb)
 …
         self.tdb.map_size
+    def test_freelist_size(self):  # Smoke test: reading the attribute must not raise; the value is not checked.
+        self.tdb.freelist_size
     def test_name(self):
         self.tdb.filename
 …
         self.assertEquals("1", self.tdb["bloe"])
     def test_iterator(self):
+    def test_transaction_prepare_commit(self):
         self.tdb["bloe"] = "2"
+        self.tdb["bla"] = "hoi"
+        i = iter(self.tdb)
+        self.assertEquals(set(["bloe", "bla"]), set([i.next(), i.next()]))
+        self.tdb.transaction_start()
+        self.tdb["bloe"] = "1"
+        self.tdb.transaction_prepare_commit()
+        self.tdb.transaction_commit()
+        self.assertEquals("1", self.tdb["bloe"])
     def test_iterkeys(self):
 …
         self.assertEquals(0, len(list(self.tdb)))
+    def test_repack(self):  # Smoke test: repack() must not raise after a store/store/delete sequence.
+        self.tdb["foo"] = "abc"
+        self.tdb["bar"] = "def"
+        del self.tdb["foo"]  # remove one record before repacking
+        self.tdb.repack()
+    def test_seqnum(self):  # One increment_seqnum_nonblock() call must raise seqnum by exactly 1.
+        self.tdb.enable_seqnum()
+        seq1 = self.tdb.seqnum  # value before the increment
+        self.tdb.increment_seqnum_nonblock()
+        seq2 = self.tdb.seqnum  # value after the increment
+        self.assertEquals(seq2-seq1, 1)
     def test_len(self):
         self.assertEquals(0, len(list(self.tdb)))
 …
         self.assertEquals(1, len(list(self.tdb)))
+    def test_add_flags(self):  # Smoke test: adding and then removing a flag must not raise.
+        self.tdb.add_flags(tdb.NOMMAP)
+        self.tdb.remove_flags(tdb.NOMMAP)
+class VersionTests(TestCase):  # Checks the module-level version attribute.
+    def test_present(self):  # tdb.__version__ must exist and be a str.
+        self.assertTrue(isinstance(tdb.__version__, str))
 if __name__ == '__main__':

trunk/server/lib/tdb/tdb.pc.in

r414	r745
7	7	Description: A trivial database
8	8	Version: @PACKAGE_VERSION@
9		Libs: -L${libdir} -ltdb
	9	Libs: @LIB_RPATH@ -L${libdir} -ltdb
10	10	Cflags: -I${includedir}
11	11	URL: http://tdb.samba.org/

trunk/server/lib/tdb/tools/tdbbackup.c

-              r414
+              r745
+        }
+        if (tdb_transaction_start(tdb_new) != 0) {
+                printf("Failed to start transaction on new tdb\n");
+        /* lock the backup tdb so that nobody else can change it */
+        if (tdb_lockall(tdb_new) != 0) {
+                printf("Failed to lock backup tdb\n");
                 tdb_close(tdb);
                 tdb_close(tdb_new);
 …
         tdb_close(tdb);
+        if (tdb_transaction_commit(tdb_new) != 0) {
+                fprintf(stderr, "Failed to commit new tdb\n");
+                tdb_close(tdb_new);
+                unlink(tmp_name);
+                free(tmp_name);
+                return 1;
+        /* copy done, unlock the backup tdb */
+        tdb_unlockall(tdb_new);
+#ifdef HAVE_FDATASYNC
+        if (fdatasync(tdb_fd(tdb_new)) != 0) {
+#else
+        if (fsync(tdb_fd(tdb_new)) != 0) {
+#endif
+                /* not fatal */
+                fprintf(stderr, "failed to fsync backup file\n");
+        }

trunk/server/lib/tdb/tools/tdbtest.c

-              r414
+              r745
+}
+static char *test_path(const char *filename)
+{       /* Returns a newly allocated path for a test file: "<prefix>/<filename>"
+         * when the TEST_DATA_PREFIX environment variable is set, otherwise a
+         * copy of filename. The result is heap memory for the caller to free
+         * and may be NULL if allocation fails. */
+        const char *prefix = getenv("TEST_DATA_PREFIX");
+        if (prefix) {
+                char *path = NULL;
+                int ret;
+                ret = asprintf(&path, "%s/%s", prefix, filename);
+                if (ret == -1) { /* asprintf reports failure with -1 */
+                        return NULL;
+                }
+                return path;
+        }
+        return strdup(filename); /* no prefix configured: use the name unchanged */
+}
  int main(int argc, const char *argv[])
+{
 …
         int loops = 10000;
         int num_entries;
+        char test_gdbm[] = "test.gdbm";
+        unlink("test.gdbm");
+        db = tdb_open("test.tdb", 0, TDB_CLEAR_IF_FIRST,
+        char test_gdbm[1] = "test.gdbm";
+        char *test_tdb;
+        test_gdbm[0] = test_path("test.gdbm");
+        test_tdb = test_path("test.tdb");
+        unlink(test_gdbm[0]);
+        db = tdb_open(test_tdb, 0, TDB_CLEAR_IF_FIRST,
                       O_RDWR | O_CREAT | O_TRUNC, 0600);
         gdbm = gdbm_open(test_gdbm, 512, GDBM_WRITER|GDBM_NEWDB|GDBM_FAST,
 …
         gdbm_close(gdbm);
+        free(test_gdbm[0]);
+        free(test_tdb);
         return 0;
+}

trunk/server/lib/tdb/tools/tdbtool.c

-              r456
+              r745
 static void info_tdb(void)
+{
         int count;
+        total_bytes = 0;
         if ((count = tdb_traverse(tdb, traverse_fn, NULL)) == -1)
+        char *summary = tdb_summary(tdb);
+        if (!summary) {
                 printf("Error = %s\n", tdb_errorstr(tdb));
+        else
+                printf("%d records totalling %d bytes\n", count, total_bytes);
+        } else {
+                printf("%s", summary);
+                free(summary);
+        }
+}
 …
     if (cmdname) {
 #endif
         if (cmdname && strlen(cmdname) == 0) {
                 mycmd = CMD_NEXT;

trunk/server/lib/tdb/tools/tdbtorture.c

-              r414
+              r745
 static int error_count;
 static int always_transaction = 0;
+static int hash_size = 2;
+static int loopnum;
+static int count_pipe;
+static struct tdb_logging_context log_ctx;
 #ifdef PRINTF_ATTRIBUTE
 …
         fflush(stdout);
 #if 0
+        {
+        if (level != TDB_DEBUG_TRACE) {
                 char *ptr;
+                signal(SIGUSR1, SIG_IGN);
                 asprintf(&ptr,"xterm -e gdb /proc/%d/exe %d", getpid(), getpid());
                 system(ptr);
 …
 static void usage(void)
+{
         printf("Usage: tdbtorture [-t] [-n NUM_PROCS] [-l NUM_LOOPS] [-s SEED] [-H HASH_SIZE]\n");
+        printf("Usage: tdbtorture [-t] [-k] [-n NUM_PROCS] [-l NUM_LOOPS] [-s SEED] [-H HASH_SIZE]\n");
         exit(0);
+}
+ int main(int argc, char * const *argv)
+{
+        int i, seed = -1;
+        int num_procs = 3;
+        int num_loops = 5000;
+        int hash_size = 2;
+        int c;
+        extern char *optarg;
+        pid_t *pids;
+        struct tdb_logging_context log_ctx;
+        log_ctx.log_fn = tdb_log;
+        while ((c = getopt(argc, argv, "n:l:s:H:th")) != -1) {
+                switch (c) {
+                case 'n':
+                        num_procs = strtol(optarg, NULL, 0);
+                        break;
+                case 'l':
+                        num_loops = strtol(optarg, NULL, 0);
+                        break;
+                case 'H':
+                        hash_size = strtol(optarg, NULL, 0);
+                        break;
+                case 's':
+                        seed = strtol(optarg, NULL, 0);
+                        break;
+                case 't':
+                        always_transaction = 1;
+                        break;
+                default:
+                        usage();
+                }
+        }
+        unlink("torture.tdb");
+        pids = (pid_t *)calloc(sizeof(pid_t), num_procs);
+        pids[0] = getpid();
+        for (i=0;i<num_procs-1;i++) {
+                if ((pids[i+1]=fork()) == 0) break;
+        }
+        db = tdb_open_ex("torture.tdb", hash_size, TDB_CLEAR_IF_FIRST,
+static void send_count_and_suicide(int sig)
+{       /* Signal handler (run_child installs it for SIGUSR1): writes the current
+         * loop counter to count_pipe, then sends SIGUSR2 to this process.
+         * The sig argument is not used. */
+        /* This ensures our successor can continue where we left off. */
+        write(count_pipe, &loopnum, sizeof(loopnum)); /* the write's return value is not checked */
+        /* This gives a unique signature. */
+        kill(getpid(), SIGUSR2); /* main() checks WTERMSIG(status) == SIGUSR2 before reading the pipe */
+}
+static int run_child(const char *filename, int i, int seed, unsigned num_loops, unsigned start)
+{
+        db = tdb_open_ex(filename, hash_size, TDB_DEFAULT,
                          O_RDWR | O_CREAT, 0600, &log_ctx, NULL);
         if (!db) {
 …
+        }
-        if (seed == -1) {
-                seed = (getpid() + time(NULL)) & 0x7FFFFFFF;
+        }
-        if (i == 0) {
-                printf("testing with %d processes, %d loops, %d hash_size, seed=%d%s\n",
-                       num_procs, num_loops, hash_size, seed, always_transaction ? " (all within transactions)" : "");
+        }
         srand(seed + i);
         srandom(seed + i);
+        for (i=0;i<num_loops && error_count == 0;i++) {
+        /* Set global, then we're ready to handle being killed. */
+        loopnum = start;
+        signal(SIGUSR1, send_count_and_suicide);
+        for (;loopnum<num_loops && error_count == 0;loopnum++) {
                 addrec_db();
+        }
 …
         tdb_close(db);
+        if (getpid() != pids[0]) {
+                return error_count;
+        }
+        for (i=1;i<num_procs;i++) {
+        return (error_count < 100 ? error_count : 100);
+}
+static char *test_path(const char *filename)
+{       /* Returns a newly allocated path for a test file: "<prefix>/<filename>"
+         * when the TEST_DATA_PREFIX environment variable is set, otherwise a
+         * copy of filename. The result is heap memory for the caller to free
+         * and may be NULL if allocation fails. */
+        const char *prefix = getenv("TEST_DATA_PREFIX");
+        if (prefix) {
+                char *path = NULL;
+                int ret;
+                ret = asprintf(&path, "%s/%s", prefix, filename);
+                if (ret == -1) { /* asprintf reports failure with -1 */
+                        return NULL;
+                }
+                return path;
+        }
+        return strdup(filename); /* no prefix configured: use the name unchanged */
+}
+int main(int argc, char * const *argv)
+{
+        int i, seed = -1;
+        int num_loops = 5000;
+        int num_procs = 3;
+        int c, pfds[2];
+        extern char *optarg;
+        pid_t *pids;
+        int kill_random = 0;
+        int *done;
+        char *test_tdb;
+        log_ctx.log_fn = tdb_log;
+        while ((c = getopt(argc, argv, "n:l:s:H:thk")) != -1) {
+                switch (c) {
+                case 'n':
+                        num_procs = strtol(optarg, NULL, 0);
+                        break;
+                case 'l':
+                        num_loops = strtol(optarg, NULL, 0);
+                        break;
+                case 'H':
+                        hash_size = strtol(optarg, NULL, 0);
+                        break;
+                case 's':
+                        seed = strtol(optarg, NULL, 0);
+                        break;
+                case 't':
+                        always_transaction = 1;
+                        break;
+                case 'k':
+                        kill_random = 1;
+                        break;
+                default:
+                        usage();
+                }
+        }
+        test_tdb = test_path("torture.tdb");
+        unlink(test_tdb);
+        if (seed == -1) {
+                seed = (getpid() + time(NULL)) & 0x7FFFFFFF;
+        }
+        if (num_procs == 1 && !kill_random) {
+                /* Don't fork for this case, makes debugging easier. */
+                error_count = run_child(test_tdb, 0, seed, num_loops, 0);
+                goto done;
+        }
+        pids = (pid_t *)calloc(sizeof(pid_t), num_procs);
+        done = (int *)calloc(sizeof(int), num_procs);
+        if (pipe(pfds) != 0) {
+                perror("Creating pipe");
+                exit(1);
+        }
+        count_pipe = pfds[1];
+        for (i=0;i<num_procs;i++) {
+                if ((pids[i]=fork()) == 0) {
+                        close(pfds[0]);
+                        if (i == 0) {
+                                printf("Testing with %d processes, %d loops, %d hash_size, seed=%d%s\n",
+                                       num_procs, num_loops, hash_size, seed, always_transaction ? " (all within transactions)" : "");
+                        }
+                        exit(run_child(test_tdb, i, seed, num_loops, 0));
+                }
+        }
+        while (num_procs) {
                 int status, j;
                 pid_t pid;
                 if (error_count != 0) {
                         /* try and stop the test on any failure */
                         for (j=1;j<num_procs;j++) {
+                        for (j=0;j<num_procs;j++) {
                                 if (pids[j] != 0) {
                                         kill(pids[j], SIGTERM);
 …
+                        }
+                }
+                pid = waitpid(-1, &status, 0);
+                pid = waitpid(-1, &status, kill_random ? WNOHANG : 0);
+                if (pid == 0) {
+                        struct timeval tv;
+                        /* Sleep for 1/10 second. */
+                        tv.tv_sec = 0;
+                        tv.tv_usec = 100000;
+                        select(0, NULL, NULL, NULL, &tv);
+                        /* Kill someone. */
+                        kill(pids[random() % num_procs], SIGUSR1);
+                        continue;
+                }
                 if (pid == -1) {
                         perror("failed to wait for child\n");
                         exit(1);
+                }
+                for (j=1;j<num_procs;j++) {
+                for (j=0;j<num_procs;j++) {
                         if (pids[j] == pid) break;
+                }
 …
                         exit(1);
+                }
+                if (WEXITSTATUS(status) != 0) {
+                        printf("child %d exited with status %d\n",
+                               (int)pid, WEXITSTATUS(status));
+                if (WIFSIGNALED(status)) {
+                        if (WTERMSIG(status) == SIGUSR2
+                            || WTERMSIG(status) == SIGUSR1) {
+                                /* SIGUSR2 means they wrote to pipe. */
+                                if (WTERMSIG(status) == SIGUSR2) {
+                                        read(pfds[0], &done[j],
+                                             sizeof(done[j]));
+                                }
+                                pids[j] = fork();
+                                if (pids[j] == 0)
+                                        exit(run_child(test_tdb, j, seed,
+                                                       num_loops, done[j]));
+                                printf("Restarting child %i for %u-%u\n",
+                                       j, done[j], num_loops);
+                                continue;
+                        }
+                        printf("child %d exited with signal %d\n",
+                               (int)pid, WTERMSIG(status));
                         error_count++;
+                }
+                pids[j] = 0;
+                } else {
+                        if (WEXITSTATUS(status) != 0) {
+                                printf("child %d exited with status %d\n",
+                                       (int)pid, WEXITSTATUS(status));
+                                error_count++;
+                        }
+                }
+                memmove(&pids[j], &pids[j+1],
+                        (num_procs - j - 1)*sizeof(pids[0]));
+                num_procs--;
+        }
         free(pids);
+done:
         if (error_count == 0) {
+                db = tdb_open_ex(test_tdb, hash_size, TDB_DEFAULT,
+                                 O_RDWR, 0, &log_ctx, NULL);
+                if (!db) {
+                        fatal("db open failed");
+                }
+                if (tdb_check(db, NULL, NULL) == -1) {
+                        printf("db check failed");
+                        exit(1);
+                }
+                tdb_close(db);
                 printf("OK\n");
+        }
+        free(test_tdb);
         return error_count;
+}

Context Navigation

Legend:

trunk/server

trunk/server/lib/tdb/common/check.c

trunk/server/lib/tdb/common/dump.c

trunk/server/lib/tdb/common/error.c

trunk/server/lib/tdb/common/freelist.c

trunk/server/lib/tdb/common/freelistcheck.c

trunk/server/lib/tdb/common/io.c

trunk/server/lib/tdb/common/lock.c

trunk/server/lib/tdb/common/open.c

trunk/server/lib/tdb/common/tdb.c

trunk/server/lib/tdb/common/tdb_private.h

trunk/server/lib/tdb/common/transaction.c

trunk/server/lib/tdb/common/traverse.c

trunk/server/lib/tdb/docs/README

trunk/server/lib/tdb/include/tdb.h

trunk/server/lib/tdb/libtdb.m4

trunk/server/lib/tdb/manpages/tdbbackup.8.xml

trunk/server/lib/tdb/manpages/tdbdump.8.xml

trunk/server/lib/tdb/manpages/tdbtool.8.xml

trunk/server/lib/tdb/pytdb.c

trunk/server/lib/tdb/python/tdbdump.py

trunk/server/lib/tdb/python/tests/simple.py

trunk/server/lib/tdb/tdb.pc.in

trunk/server/lib/tdb/tools/tdbbackup.c

trunk/server/lib/tdb/tools/tdbtest.c

trunk/server/lib/tdb/tools/tdbtool.c

trunk/server/lib/tdb/tools/tdbtorture.c

Download in other formats: