source: branches/samba-3.5.x/lib/tdb/common/io.c

Last change on this file was 847, checked in by Silvan Scherrer, 12 years ago

samba server 3.5: bring 3.5 code base in line with 3.6

File size: 14.1 KB
Line 
1 /*
2 Unix SMB/CIFS implementation.
3
4 trivial database library
5
6 Copyright (C) Andrew Tridgell 1999-2005
7 Copyright (C) Paul `Rusty' Russell 2000
8 Copyright (C) Jeremy Allison 2000-2003
9
10 ** NOTE! The following LGPL license applies to the tdb
11 ** library. This does NOT imply that all of Samba is released
12 ** under the LGPL
13
14 This library is free software; you can redistribute it and/or
15 modify it under the terms of the GNU Lesser General Public
16 License as published by the Free Software Foundation; either
17 version 3 of the License, or (at your option) any later version.
18
19 This library is distributed in the hope that it will be useful,
20 but WITHOUT ANY WARRANTY; without even the implied warranty of
21 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
22 Lesser General Public License for more details.
23
24 You should have received a copy of the GNU Lesser General Public
25 License along with this library; if not, see <http://www.gnu.org/licenses/>.
26*/
27
28
29#include "tdb_private.h"
30
31/* check for an out of bounds access - if it is out of bounds then
32 see if the database has been expanded by someone else and expand
33 if necessary
34 note that "len" is the minimum length needed for the db
35*/
36static int tdb_oob(struct tdb_context *tdb, tdb_off_t len, int probe)
37{
38 struct stat st;
39 if (len <= tdb->map_size)
40 return 0;
41 if (tdb->flags & TDB_INTERNAL) {
42 if (!probe) {
43 /* Ensure ecode is set for log fn. */
44 tdb->ecode = TDB_ERR_IO;
45 TDB_LOG((tdb, TDB_DEBUG_FATAL,"tdb_oob len %d beyond internal malloc size %d\n",
46 (int)len, (int)tdb->map_size));
47 }
48 return -1;
49 }
50
51 if (fstat(tdb->fd, &st) == -1) {
52 tdb->ecode = TDB_ERR_IO;
53 return -1;
54 }
55
56 if (st.st_size < (size_t)len) {
57 if (!probe) {
58 /* Ensure ecode is set for log fn. */
59 tdb->ecode = TDB_ERR_IO;
60 TDB_LOG((tdb, TDB_DEBUG_FATAL,"tdb_oob len %d beyond eof at %d\n",
61 (int)len, (int)st.st_size));
62 }
63 return -1;
64 }
65
66 /* Unmap, update size, remap */
67 if (tdb_munmap(tdb) == -1) {
68 tdb->ecode = TDB_ERR_IO;
69 return -1;
70 }
71 tdb->map_size = st.st_size;
72 tdb_mmap(tdb);
73 return 0;
74}
75
76/* write a lump of data at a specified offset */
77static int tdb_write(struct tdb_context *tdb, tdb_off_t off,
78 const void *buf, tdb_len_t len)
79{
80 if (len == 0) {
81 return 0;
82 }
83
84 if (tdb->read_only || tdb->traverse_read) {
85 tdb->ecode = TDB_ERR_RDONLY;
86 return -1;
87 }
88
89 if (tdb->methods->tdb_oob(tdb, off + len, 0) != 0)
90 return -1;
91
92#ifdef __OS2__
93 // YD we must upgrade read locks to write locks (exclusive), as otherwise
94 // the owner (us) is not allowed to write to the file (different from unix)
95 // if a wider previous lock is in effect, we cannot write lock our segment
96 // (e.g. a lock_upgrade locks all the file), so we hope the previous lock
97 // is a write lock: do not wait for lock.
98 // so this is what we try here:
99 // 1. add a write lock and see if it works
100 // 2. if the write lock wasn't set, we try to unlock the segment
101 // first and add the write lock afterwards
102 int upgradeLockRC = 0;
103 upgradeLockRC = tdb_brlock( tdb, off, F_WRLCK, F_SETLK, 0, len);
104 if (upgradeLockRC != 0) {
105 tdb_brlock(tdb, off, F_UNLCK, F_SETLK, 0, len);
106 upgradeLockRC = tdb_brlock(tdb, off, F_WRLCK, F_SETLK, 0, len);
107 }
108 TDB_LOG((tdb, TDB_DEBUG_TRACE,"upgrading lock at %d len=%d "
109 "before writing %s (rc=%d).\n", off, len,
110 upgradeLockRC ? "failed":"was successfull", upgradeLockRC));
111#endif
112
113 if (tdb->map_ptr) {
114 memcpy(off + (char *)tdb->map_ptr, buf, len);
115 } else {
116 ssize_t written = pwrite(tdb->fd, buf, len, off);
117 if ((written != (ssize_t)len) && (written != -1)) {
118 /* try once more */
119 tdb->ecode = TDB_ERR_IO;
120 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_write: wrote only "
121 "%d of %d bytes at %d, trying once more\n",
122 (int)written, len, off));
123 written = pwrite(tdb->fd, (const char *)buf+written,
124 len-written,
125 off+written);
126 }
127 if (written == -1) {
128 /* Ensure ecode is set for log fn. */
129 tdb->ecode = TDB_ERR_IO;
130 TDB_LOG((tdb, TDB_DEBUG_FATAL,"tdb_write failed at %d "
131 "len=%d (%s)\n", off, len, strerror(errno)));
132 return -1;
133 } else if (written != (ssize_t)len) {
134 tdb->ecode = TDB_ERR_IO;
135 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_write: failed to "
136 "write %d bytes at %d in two attempts\n",
137 len, off));
138
139#ifdef __OS2__ // remove our lock, if upgrade succeded
140 if (upgradeLockRC == 0)
141 tdb_brlock( tdb, off, F_UNLCK, F_SETLK, 0, len);
142#endif
143 return -1;
144 }
145 }
146
147#ifdef __OS2__ // remove our lock, if upgrade succeded
148 if (upgradeLockRC == 0)
149 tdb_brlock( tdb, off, F_UNLCK, F_SETLK, 0, len);
150#endif
151
152 return 0;
153}
154
/*
  Endian conversion: we only ever deal with 4 byte quantities.

  Applies TDB_BYTEREV in place to each of the size/4 complete 32 bit
  words starting at "buf" (any trailing bytes are left alone) and
  returns "buf".
*/
void *tdb_convert(void *buf, uint32_t size)
{
	uint32_t *word = (uint32_t *)buf;
	uint32_t remaining = size / 4;

	while (remaining > 0) {
		*word = TDB_BYTEREV(*word);
		word++;
		remaining--;
	}
	return buf;
}
163
164
165/* read a lump of data at a specified offset, maybe convert */
166static int tdb_read(struct tdb_context *tdb, tdb_off_t off, void *buf,
167 tdb_len_t len, int cv)
168{
169 if (tdb->methods->tdb_oob(tdb, off + len, 0) != 0) {
170 return -1;
171 }
172
173 if (tdb->map_ptr) {
174 memcpy(buf, off + (char *)tdb->map_ptr, len);
175 } else {
176 ssize_t ret = pread(tdb->fd, buf, len, off);
177 if (ret != (ssize_t)len) {
178 /* Ensure ecode is set for log fn. */
179 tdb->ecode = TDB_ERR_IO;
180 TDB_LOG((tdb, TDB_DEBUG_FATAL,"tdb_read failed at %d "
181 "len=%d ret=%d (%s) map_size=%d\n",
182 (int)off, (int)len, (int)ret, strerror(errno),
183 (int)tdb->map_size));
184 return -1;
185 }
186 }
187 if (cv) {
188 tdb_convert(buf, len);
189 }
190 return 0;
191}
192
193
194
195/*
196 do an unlocked scan of the hash table heads to find the next non-zero head. The value
197 will then be confirmed with the lock held
198*/
199static void tdb_next_hash_chain(struct tdb_context *tdb, uint32_t *chain)
200{
201 uint32_t h = *chain;
202 if (tdb->map_ptr) {
203 for (;h < tdb->header.hash_size;h++) {
204 if (0 != *(uint32_t *)(TDB_HASH_TOP(h) + (unsigned char *)tdb->map_ptr)) {
205 break;
206 }
207 }
208 } else {
209 uint32_t off=0;
210 for (;h < tdb->header.hash_size;h++) {
211 if (tdb_ofs_read(tdb, TDB_HASH_TOP(h), &off) != 0 || off != 0) {
212 break;
213 }
214 }
215 }
216 (*chain) = h;
217}
218
219
220int tdb_munmap(struct tdb_context *tdb)
221{
222 if (tdb->flags & TDB_INTERNAL)
223 return 0;
224
225#ifdef HAVE_MMAP
226 if (tdb->map_ptr) {
227 int ret;
228
229 ret = munmap(tdb->map_ptr, tdb->map_size);
230 if (ret != 0)
231 return ret;
232 }
233#endif
234 tdb->map_ptr = NULL;
235 return 0;
236}
237
238void tdb_mmap(struct tdb_context *tdb)
239{
240 if (tdb->flags & TDB_INTERNAL)
241 return;
242
243#ifdef HAVE_MMAP
244 if (!(tdb->flags & TDB_NOMMAP)) {
245 tdb->map_ptr = mmap(NULL, tdb->map_size,
246 PROT_READ|(tdb->read_only? 0:PROT_WRITE),
247 MAP_SHARED|MAP_FILE, tdb->fd, 0);
248
249 /*
250 * NB. When mmap fails it returns MAP_FAILED *NOT* NULL !!!!
251 */
252
253 if (tdb->map_ptr == MAP_FAILED) {
254 tdb->map_ptr = NULL;
255 TDB_LOG((tdb, TDB_DEBUG_WARNING, "tdb_mmap failed for size %d (%s)\n",
256 tdb->map_size, strerror(errno)));
257 }
258 } else {
259 tdb->map_ptr = NULL;
260 }
261#else
262 tdb->map_ptr = NULL;
263#endif
264}
265
266/* expand a file. we prefer to use ftruncate, as that is what posix
267 says to use for mmap expansion */
268static int tdb_expand_file(struct tdb_context *tdb, tdb_off_t size, tdb_off_t addition)
269{
270 char buf[8192];
271
272 if (tdb->read_only || tdb->traverse_read) {
273 tdb->ecode = TDB_ERR_RDONLY;
274 return -1;
275 }
276
277 if (ftruncate(tdb->fd, size+addition) == -1) {
278 char b = 0;
279 ssize_t written = pwrite(tdb->fd, &b, 1, (size+addition) - 1);
280 if (written == 0) {
281 /* try once more, potentially revealing errno */
282 written = pwrite(tdb->fd, &b, 1, (size+addition) - 1);
283 }
284 if (written == 0) {
285 /* again - give up, guessing errno */
286 errno = ENOSPC;
287 }
288 if (written != 1) {
289 TDB_LOG((tdb, TDB_DEBUG_FATAL, "expand_file to %d failed (%s)\n",
290 size+addition, strerror(errno)));
291 return -1;
292 }
293 }
294
295 /* now fill the file with something. This ensures that the
296 file isn't sparse, which would be very bad if we ran out of
297 disk. This must be done with write, not via mmap */
298 memset(buf, TDB_PAD_BYTE, sizeof(buf));
299 while (addition) {
300 size_t n = addition>sizeof(buf)?sizeof(buf):addition;
301 ssize_t written = pwrite(tdb->fd, buf, n, size);
302 if (written == 0) {
303 /* prevent infinite loops: try _once_ more */
304 written = pwrite(tdb->fd, buf, n, size);
305 }
306 if (written == 0) {
307 /* give up, trying to provide a useful errno */
308 TDB_LOG((tdb, TDB_DEBUG_FATAL, "expand_file write "
309 "returned 0 twice: giving up!\n"));
310 errno = ENOSPC;
311 return -1;
312 } else if (written == -1) {
313 TDB_LOG((tdb, TDB_DEBUG_FATAL, "expand_file write of "
314 "%d bytes failed (%s)\n", (int)n,
315 strerror(errno)));
316 return -1;
317 } else if (written != n) {
318 TDB_LOG((tdb, TDB_DEBUG_WARNING, "expand_file: wrote "
319 "only %d of %d bytes - retrying\n", (int)written,
320 (int)n));
321 }
322 addition -= written;
323 size += written;
324 }
325 return 0;
326}
327
328
329/* expand the database at least size bytes by expanding the underlying
330 file and doing the mmap again if necessary */
331int tdb_expand(struct tdb_context *tdb, tdb_off_t size)
332{
333 struct tdb_record rec;
334 tdb_off_t offset, new_size, top_size, map_size;
335
336 if (tdb_lock(tdb, -1, F_WRLCK) == -1) {
337 TDB_LOG((tdb, TDB_DEBUG_ERROR, "lock failed in tdb_expand\n"));
338 return -1;
339 }
340
341 /* must know about any previous expansions by another process */
342 tdb->methods->tdb_oob(tdb, tdb->map_size + 1, 1);
343
344 /* limit size in order to avoid using up huge amounts of memory for
345 * in memory tdbs if an oddball huge record creeps in */
346 if (size > 100 * 1024) {
347 top_size = tdb->map_size + size * 2;
348 } else {
349 top_size = tdb->map_size + size * 100;
350 }
351
352 /* always make room for at least top_size more records, and at
353 least 25% more space. if the DB is smaller than 100MiB,
354 otherwise grow it by 10% only. */
355 if (tdb->map_size > 100 * 1024 * 1024) {
356 map_size = tdb->map_size * 1.10;
357 } else {
358 map_size = tdb->map_size * 1.25;
359 }
360
361 /* Round the database up to a multiple of the page size */
362 new_size = MAX(top_size, map_size);
363 size = TDB_ALIGN(new_size, tdb->page_size) - tdb->map_size;
364
365 if (!(tdb->flags & TDB_INTERNAL))
366 tdb_munmap(tdb);
367
368 /*
369 * We must ensure the file is unmapped before doing this
370 * to ensure consistency with systems like OpenBSD where
371 * writes and mmaps are not consistent.
372 */
373
374 /* expand the file itself */
375 if (!(tdb->flags & TDB_INTERNAL)) {
376 if (tdb->methods->tdb_expand_file(tdb, tdb->map_size, size) != 0)
377 goto fail;
378 }
379
380 tdb->map_size += size;
381
382 if (tdb->flags & TDB_INTERNAL) {
383 char *new_map_ptr = (char *)realloc(tdb->map_ptr,
384 tdb->map_size);
385 if (!new_map_ptr) {
386 tdb->map_size -= size;
387 goto fail;
388 }
389 tdb->map_ptr = new_map_ptr;
390 } else {
391 /*
392 * We must ensure the file is remapped before adding the space
393 * to ensure consistency with systems like OpenBSD where
394 * writes and mmaps are not consistent.
395 */
396
397 /* We're ok if the mmap fails as we'll fallback to read/write */
398 tdb_mmap(tdb);
399 }
400
401 /* form a new freelist record */
402 memset(&rec,'\0',sizeof(rec));
403 rec.rec_len = size - sizeof(rec);
404
405 /* link it into the free list */
406 offset = tdb->map_size - size;
407 if (tdb_free(tdb, offset, &rec) == -1)
408 goto fail;
409
410 tdb_unlock(tdb, -1, F_WRLCK);
411 return 0;
412 fail:
413 tdb_unlock(tdb, -1, F_WRLCK);
414 return -1;
415}
416
417/* read/write a tdb_off_t */
418int tdb_ofs_read(struct tdb_context *tdb, tdb_off_t offset, tdb_off_t *d)
419{
420 return tdb->methods->tdb_read(tdb, offset, (char*)d, sizeof(*d), DOCONV());
421}
422
423int tdb_ofs_write(struct tdb_context *tdb, tdb_off_t offset, tdb_off_t *d)
424{
425 tdb_off_t off = *d;
426 return tdb->methods->tdb_write(tdb, offset, CONVERT(off), sizeof(*d));
427}
428
429
430/* read a lump of data, allocating the space for it */
431unsigned char *tdb_alloc_read(struct tdb_context *tdb, tdb_off_t offset, tdb_len_t len)
432{
433 unsigned char *buf;
434
435 /* some systems don't like zero length malloc */
436
437 if (!(buf = (unsigned char *)malloc(len ? len : 1))) {
438 /* Ensure ecode is set for log fn. */
439 tdb->ecode = TDB_ERR_OOM;
440 TDB_LOG((tdb, TDB_DEBUG_ERROR,"tdb_alloc_read malloc failed len=%d (%s)\n",
441 len, strerror(errno)));
442 return NULL;
443 }
444 if (tdb->methods->tdb_read(tdb, offset, buf, len, 0) == -1) {
445 SAFE_FREE(buf);
446 return NULL;
447 }
448 return buf;
449}
450
451/* Give a piece of tdb data to a parser */
452
453int tdb_parse_data(struct tdb_context *tdb, TDB_DATA key,
454 tdb_off_t offset, tdb_len_t len,
455 int (*parser)(TDB_DATA key, TDB_DATA data,
456 void *private_data),
457 void *private_data)
458{
459 TDB_DATA data;
460 int result;
461
462 data.dsize = len;
463
464 if ((tdb->transaction == NULL) && (tdb->map_ptr != NULL)) {
465 /*
466 * Optimize by avoiding the malloc/memcpy/free, point the
467 * parser directly at the mmap area.
468 */
469 if (tdb->methods->tdb_oob(tdb, offset+len, 0) != 0) {
470 return -1;
471 }
472 data.dptr = offset + (unsigned char *)tdb->map_ptr;
473 return parser(key, data, private_data);
474 }
475
476 if (!(data.dptr = tdb_alloc_read(tdb, offset, len))) {
477 return -1;
478 }
479
480 result = parser(key, data, private_data);
481 free(data.dptr);
482 return result;
483}
484
485/* read/write a record */
486int tdb_rec_read(struct tdb_context *tdb, tdb_off_t offset, struct tdb_record *rec)
487{
488 if (tdb->methods->tdb_read(tdb, offset, rec, sizeof(*rec),DOCONV()) == -1)
489 return -1;
490 if (TDB_BAD_MAGIC(rec)) {
491 /* Ensure ecode is set for log fn. */
492 tdb->ecode = TDB_ERR_CORRUPT;
493 TDB_LOG((tdb, TDB_DEBUG_FATAL,"tdb_rec_read bad magic 0x%x at offset=%d\n", rec->magic, offset));
494 return -1;
495 }
496 return tdb->methods->tdb_oob(tdb, rec->next+sizeof(*rec), 0);
497}
498
499int tdb_rec_write(struct tdb_context *tdb, tdb_off_t offset, struct tdb_record *rec)
500{
501 struct tdb_record r = *rec;
502 return tdb->methods->tdb_write(tdb, offset, CONVERT(r), sizeof(r));
503}
504
505static const struct tdb_methods io_methods = {
506 tdb_read,
507 tdb_write,
508 tdb_next_hash_chain,
509 tdb_oob,
510 tdb_expand_file,
511 tdb_brlock
512};
513
514/*
515 initialise the default methods table
516*/
517void tdb_io_init(struct tdb_context *tdb)
518{
519 tdb->methods = &io_methods;
520}
Note: See TracBrowser for help on using the repository browser.