source: trunk/src/3rdparty/sqlite/btree.c@ 205

Last change on this file since 205 was 205, checked in by rudi, 14 years ago

Added SQLite 2.8.17 sources. This allows to build at least one of the sql drivers / plugins.

File size: 110.2 KB
Line 
1/*
2** 2001 September 15
3**
4** The author disclaims copyright to this source code. In place of
5** a legal notice, here is a blessing:
6**
7** May you do good and not evil.
8** May you find forgiveness for yourself and forgive others.
9** May you share freely, never taking more than you give.
10**
11*************************************************************************
12** $Id: btree.c,v 1.103 2004/03/10 13:42:38 drh Exp $
13**
14** This file implements a external (disk-based) database using BTrees.
15** For a detailed discussion of BTrees, refer to
16**
17** Donald E. Knuth, THE ART OF COMPUTER PROGRAMMING, Volume 3:
18** "Sorting And Searching", pages 473-480. Addison-Wesley
19** Publishing Company, Reading, Massachusetts.
20**
21** The basic idea is that each page of the file contains N database
22** entries and N+1 pointers to subpages.
23**
24** ----------------------------------------------------------------
25** | Ptr(0) | Key(0) | Ptr(1) | Key(1) | ... | Key(N) | Ptr(N+1) |
26** ----------------------------------------------------------------
27**
28** All of the keys on the page that Ptr(0) points to have values less
29** than Key(0). All of the keys on page Ptr(1) and its subpages have
30** values greater than Key(0) and less than Key(1). All of the keys
31** on Ptr(N+1) and its subpages have values greater than Key(N). And
32** so forth.
33**
34** Finding a particular key requires reading O(log(M)) pages from the
35** disk where M is the number of entries in the tree.
36**
37** In this implementation, a single file can hold one or more separate
38** BTrees. Each BTree is identified by the index of its root page. The
39** key and data for any entry are combined to form the "payload". Up to
40** MX_LOCAL_PAYLOAD bytes of payload can be carried directly on the
41** database page. If the payload is larger than MX_LOCAL_PAYLOAD bytes
42** then surplus bytes are stored on overflow pages. The payload for an
43** entry and the preceding pointer are combined to form a "Cell". Each
44** page has a small header which contains the Ptr(N+1) pointer.
45**
46** The first page of the file contains a magic string used to verify that
47** the file really is a valid BTree database, a pointer to a list of unused
48** pages in the file, and some meta information. The root of the first
49** BTree begins on page 2 of the file. (Pages are numbered beginning with
50** 1, not 0.) Thus a minimum database contains 2 pages.
51*/
52#include "sqliteInt.h"
53#include "pager.h"
54#include "btree.h"
55#include <assert.h>
56
57/* Forward declarations */
58static BtOps sqliteBtreeOps;
59static BtCursorOps sqliteBtreeCursorOps;
60
61/*
62** Macros used for byteswapping. B is a pointer to the Btree
63** structure. This is needed to access the Btree.needSwab boolean
64** in order to tell if byte swapping is needed or not.
65** X is an unsigned integer. SWAB16 byte swaps a 16-bit integer.
66** SWAB32 byteswaps a 32-bit integer.
67*/
68#define SWAB16(B,X) ((B)->needSwab? swab16((u16)X) : ((u16)X))
69#define SWAB32(B,X) ((B)->needSwab? swab32(X) : (X))
70#define SWAB_ADD(B,X,A) \
71 if((B)->needSwab){ X=swab32(swab32(X)+A); }else{ X += (A); }
72
73/*
74** The following global variable - available only if SQLITE_TEST is
75** defined - is used to determine whether new databases are created in
76** native byte order or in non-native byte order. Non-native byte order
77** databases are created for testing purposes only. Under normal operation,
78** only native byte-order databases should be created, but we should be
79** able to read or write existing databases regardless of the byteorder.
80*/
81#ifdef SQLITE_TEST
82int btree_native_byte_order = 1;
83#else
84# define btree_native_byte_order 1
85#endif
86
87/*
88** Forward declarations of structures used only in this file.
89*/
90typedef struct PageOne PageOne;
91typedef struct MemPage MemPage;
92typedef struct PageHdr PageHdr;
93typedef struct Cell Cell;
94typedef struct CellHdr CellHdr;
95typedef struct FreeBlk FreeBlk;
96typedef struct OverflowPage OverflowPage;
97typedef struct FreelistInfo FreelistInfo;
98
99/*
100** All structures on a database page are aligned to 4-byte boundries.
101** This routine rounds up a number of bytes to the next multiple of 4.
102**
103** This might need to change for computer architectures that require
104** and 8-byte alignment boundry for structures.
105*/
106#define ROUNDUP(X) ((X+3) & ~3)
107
108/*
109** This is a magic string that appears at the beginning of every
110** SQLite database in order to identify the file as a real database.
111*/
112static const char zMagicHeader[] =
113 "** This file contains an SQLite 2.1 database **";
114#define MAGIC_SIZE (sizeof(zMagicHeader))
115
116/*
117** This is a magic integer also used to test the integrity of the database
118** file. This integer is used in addition to the string above so that
119** if the file is written on a little-endian architecture and read
120** on a big-endian architectures (or vice versa) we can detect the
121** problem.
122**
123** The number used was obtained at random and has no special
124** significance other than the fact that it represents a different
125** integer on little-endian and big-endian machines.
126*/
127#define MAGIC 0xdae37528
128
129/*
130** The first page of the database file contains a magic header string
131** to identify the file as an SQLite database file. It also contains
132** a pointer to the first free page of the file. Page 2 contains the
133** root of the principle BTree. The file might contain other BTrees
134** rooted on pages above 2.
135**
136** The first page also contains SQLITE_N_BTREE_META integers that
137** can be used by higher-level routines.
138**
139** Remember that pages are numbered beginning with 1. (See pager.c
140** for additional information.) Page 0 does not exist and a page
141** number of 0 is used to mean "no such page".
142*/
143struct PageOne {
144 char zMagic[MAGIC_SIZE]; /* String that identifies the file as a database */
145 int iMagic; /* Integer to verify correct byte order */
146 Pgno freeList; /* First free page in a list of all free pages */
147 int nFree; /* Number of pages on the free list */
148 int aMeta[SQLITE_N_BTREE_META-1]; /* User defined integers */
149};
150
151/*
152** Each database page has a header that is an instance of this
153** structure.
154**
155** PageHdr.firstFree is 0 if there is no free space on this page.
156** Otherwise, PageHdr.firstFree is the index in MemPage.u.aDisk[] of a
157** FreeBlk structure that describes the first block of free space.
158** All free space is defined by a linked list of FreeBlk structures.
159**
160** Data is stored in a linked list of Cell structures. PageHdr.firstCell
161** is the index into MemPage.u.aDisk[] of the first cell on the page. The
162** Cells are kept in sorted order.
163**
164** A Cell contains all information about a database entry and a pointer
165** to a child page that contains other entries less than itself. In
166** other words, the i-th Cell contains both Ptr(i) and Key(i). The
167** right-most pointer of the page is contained in PageHdr.rightChild.
168*/
169struct PageHdr {
170 Pgno rightChild; /* Child page that comes after all cells on this page */
171 u16 firstCell; /* Index in MemPage.u.aDisk[] of the first cell */
172 u16 firstFree; /* Index in MemPage.u.aDisk[] of the first free block */
173};
174
175/*
176** Entries on a page of the database are called "Cells". Each Cell
177** has a header and data. This structure defines the header. The
178** key and data (collectively the "payload") follow this header on
179** the database page.
180**
181** A definition of the complete Cell structure is given below. The
182** header for the cell must be defined first in order to do some
183** of the sizing #defines that follow.
184*/
185struct CellHdr {
186 Pgno leftChild; /* Child page that comes before this cell */
187 u16 nKey; /* Number of bytes in the key */
188 u16 iNext; /* Index in MemPage.u.aDisk[] of next cell in sorted order */
189 u8 nKeyHi; /* Upper 8 bits of key size for keys larger than 64K bytes */
190 u8 nDataHi; /* Upper 8 bits of data size when the size is more than 64K */
191 u16 nData; /* Number of bytes of data */
192};
193
194/*
195** The key and data size are split into a lower 16-bit segment and an
196** upper 8-bit segment in order to pack them together into a smaller
197** space. The following macros reassembly a key or data size back
198** into an integer.
199*/
200#define NKEY(b,h) (SWAB16(b,h.nKey) + h.nKeyHi*65536)
201#define NDATA(b,h) (SWAB16(b,h.nData) + h.nDataHi*65536)
202
203/*
204** The minimum size of a complete Cell. The Cell must contain a header
205** and at least 4 bytes of payload.
206*/
207#define MIN_CELL_SIZE (sizeof(CellHdr)+4)
208
209/*
210** The maximum number of database entries that can be held in a single
211** page of the database.
212*/
213#define MX_CELL ((SQLITE_USABLE_SIZE-sizeof(PageHdr))/MIN_CELL_SIZE)
214
215/*
216** The amount of usable space on a single page of the BTree. This is the
217** page size minus the overhead of the page header.
218*/
219#define USABLE_SPACE (SQLITE_USABLE_SIZE - sizeof(PageHdr))
220
221/*
222** The maximum amount of payload (in bytes) that can be stored locally for
223** a database entry. If the entry contains more data than this, the
224** extra goes onto overflow pages.
225**
226** This number is chosen so that at least 4 cells will fit on every page.
227*/
228#define MX_LOCAL_PAYLOAD ((USABLE_SPACE/4-(sizeof(CellHdr)+sizeof(Pgno)))&~3)
229
230/*
231** Data on a database page is stored as a linked list of Cell structures.
232** Both the key and the data are stored in aPayload[]. The key always comes
233** first. The aPayload[] field grows as necessary to hold the key and data,
234** up to a maximum of MX_LOCAL_PAYLOAD bytes. If the size of the key and
235** data combined exceeds MX_LOCAL_PAYLOAD bytes, then Cell.ovfl is the
236** page number of the first overflow page.
237**
238** Though this structure is fixed in size, the Cell on the database
239** page varies in size. Every cell has a CellHdr and at least 4 bytes
240** of payload space. Additional payload bytes (up to the maximum of
241** MX_LOCAL_PAYLOAD) and the Cell.ovfl value are allocated only as
242** needed.
243*/
244struct Cell {
245 CellHdr h; /* The cell header */
246 char aPayload[MX_LOCAL_PAYLOAD]; /* Key and data */
247 Pgno ovfl; /* The first overflow page */
248};
249
250/*
251** Free space on a page is remembered using a linked list of the FreeBlk
252** structures. Space on a database page is allocated in increments of
253** at least 4 bytes and is always aligned to a 4-byte boundry. The
254** linked list of FreeBlks is always kept in order by address.
255*/
256struct FreeBlk {
257 u16 iSize; /* Number of bytes in this block of free space */
258 u16 iNext; /* Index in MemPage.u.aDisk[] of the next free block */
259};
260
261/*
262** The number of bytes of payload that will fit on a single overflow page.
263*/
264#define OVERFLOW_SIZE (SQLITE_USABLE_SIZE-sizeof(Pgno))
265
266/*
267** When the key and data for a single entry in the BTree will not fit in
268** the MX_LOCAL_PAYLOAD bytes of space available on the database page,
269** then all extra bytes are written to a linked list of overflow pages.
270** Each overflow page is an instance of the following structure.
271**
272** Unused pages in the database are also represented by instances of
273** the OverflowPage structure. The PageOne.freeList field is the
274** page number of the first page in a linked list of unused database
275** pages.
276*/
277struct OverflowPage {
278 Pgno iNext;
279 char aPayload[OVERFLOW_SIZE];
280};
281
282/*
283** The PageOne.freeList field points to a linked list of overflow pages
284** hold information about free pages. The aPayload section of each
285** overflow page contains an instance of the following structure. The
286** aFree[] array holds the page number of nFree unused pages in the disk
287** file.
288*/
289struct FreelistInfo {
290 int nFree;
291 Pgno aFree[(OVERFLOW_SIZE-sizeof(int))/sizeof(Pgno)];
292};
293
294/*
295** For every page in the database file, an instance of the following structure
296** is stored in memory. The u.aDisk[] array contains the raw bits read from
297** the disk. The rest is auxiliary information held in memory only. The
298** auxiliary info is only valid for regular database pages - it is not
299** used for overflow pages and pages on the freelist.
300**
301** Of particular interest in the auxiliary info is the apCell[] entry. Each
302** apCell[] entry is a pointer to a Cell structure in u.aDisk[]. The cells are
303** put in this array so that they can be accessed in constant time, rather
304** than in linear time which would be needed if we had to walk the linked
305** list on every access.
306**
307** Note that apCell[] contains enough space to hold up to two more Cells
308** than can possibly fit on one page. In the steady state, every apCell[]
309** points to memory inside u.aDisk[]. But in the middle of an insert
310** operation, some apCell[] entries may temporarily point to data space
311** outside of u.aDisk[]. This is a transient situation that is quickly
312** resolved. But while it is happening, it is possible for a database
313** page to hold as many as two more cells than it might otherwise hold.
314** The extra two entries in apCell[] are an allowance for this situation.
315**
316** The pParent field points back to the parent page. This allows us to
317** walk up the BTree from any leaf to the root. Care must be taken to
318** unref() the parent page pointer when this page is no longer referenced.
319** The pageDestructor() routine handles that chore.
320*/
321struct MemPage {
322 union u_page_data {
323 char aDisk[SQLITE_PAGE_SIZE]; /* Page data stored on disk */
324 PageHdr hdr; /* Overlay page header */
325 } u;
326 u8 isInit; /* True if auxiliary data is initialized */
327 u8 idxShift; /* True if apCell[] indices have changed */
328 u8 isOverfull; /* Some apCell[] points outside u.aDisk[] */
329 MemPage *pParent; /* The parent of this page. NULL for root */
330 int idxParent; /* Index in pParent->apCell[] of this node */
331 int nFree; /* Number of free bytes in u.aDisk[] */
332 int nCell; /* Number of entries on this page */
333 Cell *apCell[MX_CELL+2]; /* All data entires in sorted order */
334};
335
336/*
337** The in-memory image of a disk page has the auxiliary information appended
338** to the end. EXTRA_SIZE is the number of bytes of space needed to hold
339** that extra information.
340*/
341#define EXTRA_SIZE (sizeof(MemPage)-sizeof(union u_page_data))
342
343/*
344** Everything we need to know about an open database
345*/
346struct Btree {
347 BtOps *pOps; /* Function table */
348 Pager *pPager; /* The page cache */
349 BtCursor *pCursor; /* A list of all open cursors */
350 PageOne *page1; /* First page of the database */
351 u8 inTrans; /* True if a transaction is in progress */
352 u8 inCkpt; /* True if there is a checkpoint on the transaction */
353 u8 readOnly; /* True if the underlying file is readonly */
354 u8 needSwab; /* Need to byte-swapping */
355};
356typedef Btree Bt;
357
358/*
359** A cursor is a pointer to a particular entry in the BTree.
360** The entry is identified by its MemPage and the index in
361** MemPage.apCell[] of the entry.
362*/
363struct BtCursor {
364 BtCursorOps *pOps; /* Function table */
365 Btree *pBt; /* The Btree to which this cursor belongs */
366 BtCursor *pNext, *pPrev; /* Forms a linked list of all cursors */
367 BtCursor *pShared; /* Loop of cursors with the same root page */
368 Pgno pgnoRoot; /* The root page of this tree */
369 MemPage *pPage; /* Page that contains the entry */
370 int idx; /* Index of the entry in pPage->apCell[] */
371 u8 wrFlag; /* True if writable */
372 u8 eSkip; /* Determines if next step operation is a no-op */
373 u8 iMatch; /* compare result from last sqliteBtreeMoveto() */
374};
375
376/*
377** Legal values for BtCursor.eSkip.
378*/
379#define SKIP_NONE 0 /* Always step the cursor */
380#define SKIP_NEXT 1 /* The next sqliteBtreeNext() is a no-op */
381#define SKIP_PREV 2 /* The next sqliteBtreePrevious() is a no-op */
382#define SKIP_INVALID 3 /* Calls to Next() and Previous() are invalid */
383
384/* Forward declarations */
385static int fileBtreeCloseCursor(BtCursor *pCur);
386
387/*
388** Routines for byte swapping.
389*/
390u16 swab16(u16 x){
391 return ((x & 0xff)<<8) | ((x>>8)&0xff);
392}
393u32 swab32(u32 x){
394 return ((x & 0xff)<<24) | ((x & 0xff00)<<8) |
395 ((x>>8) & 0xff00) | ((x>>24)&0xff);
396}
397
398/*
399** Compute the total number of bytes that a Cell needs on the main
400** database page. The number returned includes the Cell header,
401** local payload storage, and the pointer to overflow pages (if
402** applicable). Additional space allocated on overflow pages
403** is NOT included in the value returned from this routine.
404*/
405static int cellSize(Btree *pBt, Cell *pCell){
406 int n = NKEY(pBt, pCell->h) + NDATA(pBt, pCell->h);
407 if( n>MX_LOCAL_PAYLOAD ){
408 n = MX_LOCAL_PAYLOAD + sizeof(Pgno);
409 }else{
410 n = ROUNDUP(n);
411 }
412 n += sizeof(CellHdr);
413 return n;
414}
415
416/*
417** Defragment the page given. All Cells are moved to the
418** beginning of the page and all free space is collected
419** into one big FreeBlk at the end of the page.
420*/
421static void defragmentPage(Btree *pBt, MemPage *pPage){
422 int pc, i, n;
423 FreeBlk *pFBlk;
424 char newPage[SQLITE_USABLE_SIZE];
425
426 assert( sqlitepager_iswriteable(pPage) );
427 assert( pPage->isInit );
428 pc = sizeof(PageHdr);
429 pPage->u.hdr.firstCell = SWAB16(pBt, pc);
430 memcpy(newPage, pPage->u.aDisk, pc);
431 for(i=0; i<pPage->nCell; i++){
432 Cell *pCell = pPage->apCell[i];
433
434 /* This routine should never be called on an overfull page. The
435 ** following asserts verify that constraint. */
436 assert( Addr(pCell) > Addr(pPage) );
437 assert( Addr(pCell) < Addr(pPage) + SQLITE_USABLE_SIZE );
438
439 n = cellSize(pBt, pCell);
440 pCell->h.iNext = SWAB16(pBt, pc + n);
441 memcpy(&newPage[pc], pCell, n);
442 pPage->apCell[i] = (Cell*)&pPage->u.aDisk[pc];
443 pc += n;
444 }
445 assert( pPage->nFree==SQLITE_USABLE_SIZE-pc );
446 memcpy(pPage->u.aDisk, newPage, pc);
447 if( pPage->nCell>0 ){
448 pPage->apCell[pPage->nCell-1]->h.iNext = 0;
449 }
450 pFBlk = (FreeBlk*)&pPage->u.aDisk[pc];
451 pFBlk->iSize = SWAB16(pBt, SQLITE_USABLE_SIZE - pc);
452 pFBlk->iNext = 0;
453 pPage->u.hdr.firstFree = SWAB16(pBt, pc);
454 memset(&pFBlk[1], 0, SQLITE_USABLE_SIZE - pc - sizeof(FreeBlk));
455}
456
457/*
458** Allocate nByte bytes of space on a page. nByte must be a
459** multiple of 4.
460**
461** Return the index into pPage->u.aDisk[] of the first byte of
462** the new allocation. Or return 0 if there is not enough free
463** space on the page to satisfy the allocation request.
464**
465** If the page contains nBytes of free space but does not contain
466** nBytes of contiguous free space, then this routine automatically
467** calls defragementPage() to consolidate all free space before
468** allocating the new chunk.
469*/
470static int allocateSpace(Btree *pBt, MemPage *pPage, int nByte){
471 FreeBlk *p;
472 u16 *pIdx;
473 int start;
474 int iSize;
475#ifndef NDEBUG
476 int cnt = 0;
477#endif
478
479 assert( sqlitepager_iswriteable(pPage) );
480 assert( nByte==ROUNDUP(nByte) );
481 assert( pPage->isInit );
482 if( pPage->nFree<nByte || pPage->isOverfull ) return 0;
483 pIdx = &pPage->u.hdr.firstFree;
484 p = (FreeBlk*)&pPage->u.aDisk[SWAB16(pBt, *pIdx)];
485 while( (iSize = SWAB16(pBt, p->iSize))<nByte ){
486 assert( cnt++ < SQLITE_USABLE_SIZE/4 );
487 if( p->iNext==0 ){
488 defragmentPage(pBt, pPage);
489 pIdx = &pPage->u.hdr.firstFree;
490 }else{
491 pIdx = &p->iNext;
492 }
493 p = (FreeBlk*)&pPage->u.aDisk[SWAB16(pBt, *pIdx)];
494 }
495 if( iSize==nByte ){
496 start = SWAB16(pBt, *pIdx);
497 *pIdx = p->iNext;
498 }else{
499 FreeBlk *pNew;
500 start = SWAB16(pBt, *pIdx);
501 pNew = (FreeBlk*)&pPage->u.aDisk[start + nByte];
502 pNew->iNext = p->iNext;
503 pNew->iSize = SWAB16(pBt, iSize - nByte);
504 *pIdx = SWAB16(pBt, start + nByte);
505 }
506 pPage->nFree -= nByte;
507 return start;
508}
509
510/*
511** Return a section of the MemPage.u.aDisk[] to the freelist.
512** The first byte of the new free block is pPage->u.aDisk[start]
513** and the size of the block is "size" bytes. Size must be
514** a multiple of 4.
515**
516** Most of the effort here is involved in coalesing adjacent
517** free blocks into a single big free block.
518*/
519static void freeSpace(Btree *pBt, MemPage *pPage, int start, int size){
520 int end = start + size;
521 u16 *pIdx, idx;
522 FreeBlk *pFBlk;
523 FreeBlk *pNew;
524 FreeBlk *pNext;
525 int iSize;
526
527 assert( sqlitepager_iswriteable(pPage) );
528 assert( size == ROUNDUP(size) );
529 assert( start == ROUNDUP(start) );
530 assert( pPage->isInit );
531 pIdx = &pPage->u.hdr.firstFree;
532 idx = SWAB16(pBt, *pIdx);
533 while( idx!=0 && idx<start ){
534 pFBlk = (FreeBlk*)&pPage->u.aDisk[idx];
535 iSize = SWAB16(pBt, pFBlk->iSize);
536 if( idx + iSize == start ){
537 pFBlk->iSize = SWAB16(pBt, iSize + size);
538 if( idx + iSize + size == SWAB16(pBt, pFBlk->iNext) ){
539 pNext = (FreeBlk*)&pPage->u.aDisk[idx + iSize + size];
540 if( pBt->needSwab ){
541 pFBlk->iSize = swab16((u16)swab16(pNext->iSize)+iSize+size);
542 }else{
543 pFBlk->iSize += pNext->iSize;
544 }
545 pFBlk->iNext = pNext->iNext;
546 }
547 pPage->nFree += size;
548 return;
549 }
550 pIdx = &pFBlk->iNext;
551 idx = SWAB16(pBt, *pIdx);
552 }
553 pNew = (FreeBlk*)&pPage->u.aDisk[start];
554 if( idx != end ){
555 pNew->iSize = SWAB16(pBt, size);
556 pNew->iNext = SWAB16(pBt, idx);
557 }else{
558 pNext = (FreeBlk*)&pPage->u.aDisk[idx];
559 pNew->iSize = SWAB16(pBt, size + SWAB16(pBt, pNext->iSize));
560 pNew->iNext = pNext->iNext;
561 }
562 *pIdx = SWAB16(pBt, start);
563 pPage->nFree += size;
564}
565
566/*
567** Initialize the auxiliary information for a disk block.
568**
569** The pParent parameter must be a pointer to the MemPage which
570** is the parent of the page being initialized. The root of the
571** BTree (usually page 2) has no parent and so for that page,
572** pParent==NULL.
573**
574** Return SQLITE_OK on success. If we see that the page does
575** not contain a well-formed database page, then return
576** SQLITE_CORRUPT. Note that a return of SQLITE_OK does not
577** guarantee that the page is well-formed. It only shows that
578** we failed to detect any corruption.
579*/
580static int initPage(Bt *pBt, MemPage *pPage, Pgno pgnoThis, MemPage *pParent){
581 int idx; /* An index into pPage->u.aDisk[] */
582 Cell *pCell; /* A pointer to a Cell in pPage->u.aDisk[] */
583 FreeBlk *pFBlk; /* A pointer to a free block in pPage->u.aDisk[] */
584 int sz; /* The size of a Cell in bytes */
585 int freeSpace; /* Amount of free space on the page */
586
587 if( pPage->pParent ){
588 assert( pPage->pParent==pParent );
589 return SQLITE_OK;
590 }
591 if( pParent ){
592 pPage->pParent = pParent;
593 sqlitepager_ref(pParent);
594 }
595 if( pPage->isInit ) return SQLITE_OK;
596 pPage->isInit = 1;
597 pPage->nCell = 0;
598 freeSpace = USABLE_SPACE;
599 idx = SWAB16(pBt, pPage->u.hdr.firstCell);
600 while( idx!=0 ){
601 if( idx>SQLITE_USABLE_SIZE-MIN_CELL_SIZE ) goto page_format_error;
602 if( idx<sizeof(PageHdr) ) goto page_format_error;
603 if( idx!=ROUNDUP(idx) ) goto page_format_error;
604 pCell = (Cell*)&pPage->u.aDisk[idx];
605 sz = cellSize(pBt, pCell);
606 if( idx+sz > SQLITE_USABLE_SIZE ) goto page_format_error;
607 freeSpace -= sz;
608 pPage->apCell[pPage->nCell++] = pCell;
609 idx = SWAB16(pBt, pCell->h.iNext);
610 }
611 pPage->nFree = 0;
612 idx = SWAB16(pBt, pPage->u.hdr.firstFree);
613 while( idx!=0 ){
614 int iNext;
615 if( idx>SQLITE_USABLE_SIZE-sizeof(FreeBlk) ) goto page_format_error;
616 if( idx<sizeof(PageHdr) ) goto page_format_error;
617 pFBlk = (FreeBlk*)&pPage->u.aDisk[idx];
618 pPage->nFree += SWAB16(pBt, pFBlk->iSize);
619 iNext = SWAB16(pBt, pFBlk->iNext);
620 if( iNext>0 && iNext <= idx ) goto page_format_error;
621 idx = iNext;
622 }
623 if( pPage->nCell==0 && pPage->nFree==0 ){
624 /* As a special case, an uninitialized root page appears to be
625 ** an empty database */
626 return SQLITE_OK;
627 }
628 if( pPage->nFree!=freeSpace ) goto page_format_error;
629 return SQLITE_OK;
630
631page_format_error:
632 return SQLITE_CORRUPT;
633}
634
635/*
636** Set up a raw page so that it looks like a database page holding
637** no entries.
638*/
639static void zeroPage(Btree *pBt, MemPage *pPage){
640 PageHdr *pHdr;
641 FreeBlk *pFBlk;
642 assert( sqlitepager_iswriteable(pPage) );
643 memset(pPage, 0, SQLITE_USABLE_SIZE);
644 pHdr = &pPage->u.hdr;
645 pHdr->firstCell = 0;
646 pHdr->firstFree = SWAB16(pBt, sizeof(*pHdr));
647 pFBlk = (FreeBlk*)&pHdr[1];
648 pFBlk->iNext = 0;
649 pPage->nFree = SQLITE_USABLE_SIZE - sizeof(*pHdr);
650 pFBlk->iSize = SWAB16(pBt, pPage->nFree);
651 pPage->nCell = 0;
652 pPage->isOverfull = 0;
653}
654
655/*
656** This routine is called when the reference count for a page
657** reaches zero. We need to unref the pParent pointer when that
658** happens.
659*/
660static void pageDestructor(void *pData){
661 MemPage *pPage = (MemPage*)pData;
662 if( pPage->pParent ){
663 MemPage *pParent = pPage->pParent;
664 pPage->pParent = 0;
665 sqlitepager_unref(pParent);
666 }
667}
668
669/*
670** Open a new database.
671**
672** Actually, this routine just sets up the internal data structures
673** for accessing the database. We do not open the database file
674** until the first page is loaded.
675**
676** zFilename is the name of the database file. If zFilename is NULL
677** a new database with a random name is created. This randomly named
678** database file will be deleted when sqliteBtreeClose() is called.
679*/
680int sqliteBtreeOpen(
681 const char *zFilename, /* Name of the file containing the BTree database */
682 int omitJournal, /* if TRUE then do not journal this file */
683 int nCache, /* How many pages in the page cache */
684 Btree **ppBtree /* Pointer to new Btree object written here */
685){
686 Btree *pBt;
687 int rc;
688
689 /*
690 ** The following asserts make sure that structures used by the btree are
691 ** the right size. This is to guard against size changes that result
692 ** when compiling on a different architecture.
693 */
694 assert( sizeof(u32)==4 );
695 assert( sizeof(u16)==2 );
696 assert( sizeof(Pgno)==4 );
697 assert( sizeof(PageHdr)==8 );
698 assert( sizeof(CellHdr)==12 );
699 assert( sizeof(FreeBlk)==4 );
700 assert( sizeof(OverflowPage)==SQLITE_USABLE_SIZE );
701 assert( sizeof(FreelistInfo)==OVERFLOW_SIZE );
702 assert( sizeof(ptr)==sizeof(char*) );
703 assert( sizeof(uptr)==sizeof(ptr) );
704
705 pBt = sqliteMalloc( sizeof(*pBt) );
706 if( pBt==0 ){
707 *ppBtree = 0;
708 return SQLITE_NOMEM;
709 }
710 if( nCache<10 ) nCache = 10;
711 rc = sqlitepager_open(&pBt->pPager, zFilename, nCache, EXTRA_SIZE,
712 !omitJournal);
713 if( rc!=SQLITE_OK ){
714 if( pBt->pPager ) sqlitepager_close(pBt->pPager);
715 sqliteFree(pBt);
716 *ppBtree = 0;
717 return rc;
718 }
719 sqlitepager_set_destructor(pBt->pPager, pageDestructor);
720 pBt->pCursor = 0;
721 pBt->page1 = 0;
722 pBt->readOnly = sqlitepager_isreadonly(pBt->pPager);
723 pBt->pOps = &sqliteBtreeOps;
724 *ppBtree = pBt;
725 return SQLITE_OK;
726}
727
728/*
729** Close an open database and invalidate all cursors.
730*/
731static int fileBtreeClose(Btree *pBt){
732 while( pBt->pCursor ){
733 fileBtreeCloseCursor(pBt->pCursor);
734 }
735 sqlitepager_close(pBt->pPager);
736 sqliteFree(pBt);
737 return SQLITE_OK;
738}
739
740/*
741** Change the limit on the number of pages allowed in the cache.
742**
743** The maximum number of cache pages is set to the absolute
744** value of mxPage. If mxPage is negative, the pager will
745** operate asynchronously - it will not stop to do fsync()s
746** to insure data is written to the disk surface before
747** continuing. Transactions still work if synchronous is off,
748** and the database cannot be corrupted if this program
749** crashes. But if the operating system crashes or there is
750** an abrupt power failure when synchronous is off, the database
751** could be left in an inconsistent and unrecoverable state.
752** Synchronous is on by default so database corruption is not
753** normally a worry.
754*/
755static int fileBtreeSetCacheSize(Btree *pBt, int mxPage){
756 sqlitepager_set_cachesize(pBt->pPager, mxPage);
757 return SQLITE_OK;
758}
759
760/*
761** Change the way data is synced to disk in order to increase or decrease
762** how well the database resists damage due to OS crashes and power
763** failures. Level 1 is the same as asynchronous (no syncs() occur and
764** there is a high probability of damage) Level 2 is the default. There
765** is a very low but non-zero probability of damage. Level 3 reduces the
766** probability of damage to near zero but with a write performance reduction.
767*/
768static int fileBtreeSetSafetyLevel(Btree *pBt, int level){
769 sqlitepager_set_safety_level(pBt->pPager, level);
770 return SQLITE_OK;
771}
772
773/*
774** Get a reference to page1 of the database file. This will
775** also acquire a readlock on that file.
776**
777** SQLITE_OK is returned on success. If the file is not a
778** well-formed database file, then SQLITE_CORRUPT is returned.
779** SQLITE_BUSY is returned if the database is locked. SQLITE_NOMEM
780** is returned if we run out of memory. SQLITE_PROTOCOL is returned
781** if there is a locking protocol violation.
782*/
783static int lockBtree(Btree *pBt){
784 int rc;
785 if( pBt->page1 ) return SQLITE_OK;
786 rc = sqlitepager_get(pBt->pPager, 1, (void**)&pBt->page1);
787 if( rc!=SQLITE_OK ) return rc;
788
789 /* Do some checking to help insure the file we opened really is
790 ** a valid database file.
791 */
792 if( sqlitepager_pagecount(pBt->pPager)>0 ){
793 PageOne *pP1 = pBt->page1;
794 if( strcmp(pP1->zMagic,zMagicHeader)!=0 ||
795 (pP1->iMagic!=MAGIC && swab32(pP1->iMagic)!=MAGIC) ){
796 rc = SQLITE_NOTADB;
797 goto page1_init_failed;
798 }
799 pBt->needSwab = pP1->iMagic!=MAGIC;
800 }
801 return rc;
802
803page1_init_failed:
804 sqlitepager_unref(pBt->page1);
805 pBt->page1 = 0;
806 return rc;
807}
808
809/*
810** If there are no outstanding cursors and we are not in the middle
811** of a transaction but there is a read lock on the database, then
812** this routine unrefs the first page of the database file which
813** has the effect of releasing the read lock.
814**
815** If there are any outstanding cursors, this routine is a no-op.
816**
817** If there is a transaction in progress, this routine is a no-op.
818*/
819static void unlockBtreeIfUnused(Btree *pBt){
820 if( pBt->inTrans==0 && pBt->pCursor==0 && pBt->page1!=0 ){
821 sqlitepager_unref(pBt->page1);
822 pBt->page1 = 0;
823 pBt->inTrans = 0;
824 pBt->inCkpt = 0;
825 }
826}
827
828/*
829** Create a new database by initializing the first two pages of the
830** file.
831*/
832static int newDatabase(Btree *pBt){
833 MemPage *pRoot;
834 PageOne *pP1;
835 int rc;
836 if( sqlitepager_pagecount(pBt->pPager)>1 ) return SQLITE_OK;
837 pP1 = pBt->page1;
838 rc = sqlitepager_write(pBt->page1);
839 if( rc ) return rc;
840 rc = sqlitepager_get(pBt->pPager, 2, (void**)&pRoot);
841 if( rc ) return rc;
842 rc = sqlitepager_write(pRoot);
843 if( rc ){
844 sqlitepager_unref(pRoot);
845 return rc;
846 }
847 strcpy(pP1->zMagic, zMagicHeader);
848 if( btree_native_byte_order ){
849 pP1->iMagic = MAGIC;
850 pBt->needSwab = 0;
851 }else{
852 pP1->iMagic = swab32(MAGIC);
853 pBt->needSwab = 1;
854 }
855 zeroPage(pBt, pRoot);
856 sqlitepager_unref(pRoot);
857 return SQLITE_OK;
858}
859
860/*
861** Attempt to start a new transaction.
862**
863** A transaction must be started before attempting any changes
864** to the database. None of the following routines will work
865** unless a transaction is started first:
866**
867** sqliteBtreeCreateTable()
868** sqliteBtreeCreateIndex()
869** sqliteBtreeClearTable()
870** sqliteBtreeDropTable()
871** sqliteBtreeInsert()
872** sqliteBtreeDelete()
873** sqliteBtreeUpdateMeta()
874*/
875static int fileBtreeBeginTrans(Btree *pBt){
876 int rc;
877 if( pBt->inTrans ) return SQLITE_ERROR;
878 if( pBt->readOnly ) return SQLITE_READONLY;
879 if( pBt->page1==0 ){
880 rc = lockBtree(pBt);
881 if( rc!=SQLITE_OK ){
882 return rc;
883 }
884 }
885 rc = sqlitepager_begin(pBt->page1);
886 if( rc==SQLITE_OK ){
887 rc = newDatabase(pBt);
888 }
889 if( rc==SQLITE_OK ){
890 pBt->inTrans = 1;
891 pBt->inCkpt = 0;
892 }else{
893 unlockBtreeIfUnused(pBt);
894 }
895 return rc;
896}
897
898/*
899** Commit the transaction currently in progress.
900**
901** This will release the write lock on the database file. If there
902** are no active cursors, it also releases the read lock.
903*/
904static int fileBtreeCommit(Btree *pBt){
905 int rc;
906 rc = pBt->readOnly ? SQLITE_OK : sqlitepager_commit(pBt->pPager);
907 pBt->inTrans = 0;
908 pBt->inCkpt = 0;
909 unlockBtreeIfUnused(pBt);
910 return rc;
911}
912
913/*
914** Rollback the transaction in progress. All cursors will be
915** invalided by this operation. Any attempt to use a cursor
916** that was open at the beginning of this operation will result
917** in an error.
918**
919** This will release the write lock on the database file. If there
920** are no active cursors, it also releases the read lock.
921*/
922static int fileBtreeRollback(Btree *pBt){
923 int rc;
924 BtCursor *pCur;
925 if( pBt->inTrans==0 ) return SQLITE_OK;
926 pBt->inTrans = 0;
927 pBt->inCkpt = 0;
928 rc = pBt->readOnly ? SQLITE_OK : sqlitepager_rollback(pBt->pPager);
929 for(pCur=pBt->pCursor; pCur; pCur=pCur->pNext){
930 if( pCur->pPage && pCur->pPage->isInit==0 ){
931 sqlitepager_unref(pCur->pPage);
932 pCur->pPage = 0;
933 }
934 }
935 unlockBtreeIfUnused(pBt);
936 return rc;
937}
938
939/*
940** Set the checkpoint for the current transaction. The checkpoint serves
941** as a sub-transaction that can be rolled back independently of the
942** main transaction. You must start a transaction before starting a
943** checkpoint. The checkpoint is ended automatically if the transaction
944** commits or rolls back.
945**
946** Only one checkpoint may be active at a time. It is an error to try
947** to start a new checkpoint if another checkpoint is already active.
948*/
949static int fileBtreeBeginCkpt(Btree *pBt){
950 int rc;
951 if( !pBt->inTrans || pBt->inCkpt ){
952 return pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
953 }
954 rc = pBt->readOnly ? SQLITE_OK : sqlitepager_ckpt_begin(pBt->pPager);
955 pBt->inCkpt = 1;
956 return rc;
957}
958
959
960/*
961** Commit a checkpoint to transaction currently in progress. If no
962** checkpoint is active, this is a no-op.
963*/
964static int fileBtreeCommitCkpt(Btree *pBt){
965 int rc;
966 if( pBt->inCkpt && !pBt->readOnly ){
967 rc = sqlitepager_ckpt_commit(pBt->pPager);
968 }else{
969 rc = SQLITE_OK;
970 }
971 pBt->inCkpt = 0;
972 return rc;
973}
974
975/*
976** Rollback the checkpoint to the current transaction. If there
977** is no active checkpoint or transaction, this routine is a no-op.
978**
979** All cursors will be invalided by this operation. Any attempt
980** to use a cursor that was open at the beginning of this operation
981** will result in an error.
982*/
983static int fileBtreeRollbackCkpt(Btree *pBt){
984 int rc;
985 BtCursor *pCur;
986 if( pBt->inCkpt==0 || pBt->readOnly ) return SQLITE_OK;
987 rc = sqlitepager_ckpt_rollback(pBt->pPager);
988 for(pCur=pBt->pCursor; pCur; pCur=pCur->pNext){
989 if( pCur->pPage && pCur->pPage->isInit==0 ){
990 sqlitepager_unref(pCur->pPage);
991 pCur->pPage = 0;
992 }
993 }
994 pBt->inCkpt = 0;
995 return rc;
996}
997
998/*
999** Create a new cursor for the BTree whose root is on the page
1000** iTable. The act of acquiring a cursor gets a read lock on
1001** the database file.
1002**
1003** If wrFlag==0, then the cursor can only be used for reading.
1004** If wrFlag==1, then the cursor can be used for reading or for
1005** writing if other conditions for writing are also met. These
1006** are the conditions that must be met in order for writing to
1007** be allowed:
1008**
1009** 1: The cursor must have been opened with wrFlag==1
1010**
1011** 2: No other cursors may be open with wrFlag==0 on the same table
1012**
1013** 3: The database must be writable (not on read-only media)
1014**
1015** 4: There must be an active transaction.
1016**
1017** Condition 2 warrants further discussion. If any cursor is opened
1018** on a table with wrFlag==0, that prevents all other cursors from
1019** writing to that table. This is a kind of "read-lock". When a cursor
1020** is opened with wrFlag==0 it is guaranteed that the table will not
1021** change as long as the cursor is open. This allows the cursor to
1022** do a sequential scan of the table without having to worry about
1023** entries being inserted or deleted during the scan. Cursors should
1024** be opened with wrFlag==0 only if this read-lock property is needed.
1025** That is to say, cursors should be opened with wrFlag==0 only if they
1026** intend to use the sqliteBtreeNext() system call. All other cursors
1027** should be opened with wrFlag==1 even if they never really intend
1028** to write.
1029**
1030** No checking is done to make sure that page iTable really is the
1031** root page of a b-tree. If it is not, then the cursor acquired
1032** will not work correctly.
1033*/
1034static
1035int fileBtreeCursor(Btree *pBt, int iTable, int wrFlag, BtCursor **ppCur){
1036 int rc;
1037 BtCursor *pCur, *pRing;
1038
1039 if( pBt->readOnly && wrFlag ){
1040 *ppCur = 0;
1041 return SQLITE_READONLY;
1042 }
1043 if( pBt->page1==0 ){
1044 rc = lockBtree(pBt);
1045 if( rc!=SQLITE_OK ){
1046 *ppCur = 0;
1047 return rc;
1048 }
1049 }
1050 pCur = sqliteMalloc( sizeof(*pCur) );
1051 if( pCur==0 ){
1052 rc = SQLITE_NOMEM;
1053 goto create_cursor_exception;
1054 }
1055 pCur->pgnoRoot = (Pgno)iTable;
1056 rc = sqlitepager_get(pBt->pPager, pCur->pgnoRoot, (void**)&pCur->pPage);
1057 if( rc!=SQLITE_OK ){
1058 goto create_cursor_exception;
1059 }
1060 rc = initPage(pBt, pCur->pPage, pCur->pgnoRoot, 0);
1061 if( rc!=SQLITE_OK ){
1062 goto create_cursor_exception;
1063 }
1064 pCur->pOps = &sqliteBtreeCursorOps;
1065 pCur->pBt = pBt;
1066 pCur->wrFlag = wrFlag;
1067 pCur->idx = 0;
1068 pCur->eSkip = SKIP_INVALID;
1069 pCur->pNext = pBt->pCursor;
1070 if( pCur->pNext ){
1071 pCur->pNext->pPrev = pCur;
1072 }
1073 pCur->pPrev = 0;
1074 pRing = pBt->pCursor;
1075 while( pRing && pRing->pgnoRoot!=pCur->pgnoRoot ){ pRing = pRing->pNext; }
1076 if( pRing ){
1077 pCur->pShared = pRing->pShared;
1078 pRing->pShared = pCur;
1079 }else{
1080 pCur->pShared = pCur;
1081 }
1082 pBt->pCursor = pCur;
1083 *ppCur = pCur;
1084 return SQLITE_OK;
1085
1086create_cursor_exception:
1087 *ppCur = 0;
1088 if( pCur ){
1089 if( pCur->pPage ) sqlitepager_unref(pCur->pPage);
1090 sqliteFree(pCur);
1091 }
1092 unlockBtreeIfUnused(pBt);
1093 return rc;
1094}
1095
1096/*
1097** Close a cursor. The read lock on the database file is released
1098** when the last cursor is closed.
1099*/
1100static int fileBtreeCloseCursor(BtCursor *pCur){
1101 Btree *pBt = pCur->pBt;
1102 if( pCur->pPrev ){
1103 pCur->pPrev->pNext = pCur->pNext;
1104 }else{
1105 pBt->pCursor = pCur->pNext;
1106 }
1107 if( pCur->pNext ){
1108 pCur->pNext->pPrev = pCur->pPrev;
1109 }
1110 if( pCur->pPage ){
1111 sqlitepager_unref(pCur->pPage);
1112 }
1113 if( pCur->pShared!=pCur ){
1114 BtCursor *pRing = pCur->pShared;
1115 while( pRing->pShared!=pCur ){ pRing = pRing->pShared; }
1116 pRing->pShared = pCur->pShared;
1117 }
1118 unlockBtreeIfUnused(pBt);
1119 sqliteFree(pCur);
1120 return SQLITE_OK;
1121}
1122
1123/*
1124** Make a temporary cursor by filling in the fields of pTempCur.
1125** The temporary cursor is not on the cursor list for the Btree.
1126*/
1127static void getTempCursor(BtCursor *pCur, BtCursor *pTempCur){
1128 memcpy(pTempCur, pCur, sizeof(*pCur));
1129 pTempCur->pNext = 0;
1130 pTempCur->pPrev = 0;
1131 if( pTempCur->pPage ){
1132 sqlitepager_ref(pTempCur->pPage);
1133 }
1134}
1135
1136/*
1137** Delete a temporary cursor such as was made by the CreateTemporaryCursor()
1138** function above.
1139*/
1140static void releaseTempCursor(BtCursor *pCur){
1141 if( pCur->pPage ){
1142 sqlitepager_unref(pCur->pPage);
1143 }
1144}
1145
1146/*
1147** Set *pSize to the number of bytes of key in the entry the
1148** cursor currently points to. Always return SQLITE_OK.
1149** Failure is not possible. If the cursor is not currently
1150** pointing to an entry (which can happen, for example, if
1151** the database is empty) then *pSize is set to 0.
1152*/
1153static int fileBtreeKeySize(BtCursor *pCur, int *pSize){
1154 Cell *pCell;
1155 MemPage *pPage;
1156
1157 pPage = pCur->pPage;
1158 assert( pPage!=0 );
1159 if( pCur->idx >= pPage->nCell ){
1160 *pSize = 0;
1161 }else{
1162 pCell = pPage->apCell[pCur->idx];
1163 *pSize = NKEY(pCur->pBt, pCell->h);
1164 }
1165 return SQLITE_OK;
1166}
1167
1168/*
1169** Read payload information from the entry that the pCur cursor is
1170** pointing to. Begin reading the payload at "offset" and read
1171** a total of "amt" bytes. Put the result in zBuf.
1172**
1173** This routine does not make a distinction between key and data.
1174** It just reads bytes from the payload area.
1175*/
1176static int getPayload(BtCursor *pCur, int offset, int amt, char *zBuf){
1177 char *aPayload;
1178 Pgno nextPage;
1179 int rc;
1180 Btree *pBt = pCur->pBt;
1181 assert( pCur!=0 && pCur->pPage!=0 );
1182 assert( pCur->idx>=0 && pCur->idx<pCur->pPage->nCell );
1183 aPayload = pCur->pPage->apCell[pCur->idx]->aPayload;
1184 if( offset<MX_LOCAL_PAYLOAD ){
1185 int a = amt;
1186 if( a+offset>MX_LOCAL_PAYLOAD ){
1187 a = MX_LOCAL_PAYLOAD - offset;
1188 }
1189 memcpy(zBuf, &aPayload[offset], a);
1190 if( a==amt ){
1191 return SQLITE_OK;
1192 }
1193 offset = 0;
1194 zBuf += a;
1195 amt -= a;
1196 }else{
1197 offset -= MX_LOCAL_PAYLOAD;
1198 }
1199 if( amt>0 ){
1200 nextPage = SWAB32(pBt, pCur->pPage->apCell[pCur->idx]->ovfl);
1201 }
1202 while( amt>0 && nextPage ){
1203 OverflowPage *pOvfl;
1204 rc = sqlitepager_get(pBt->pPager, nextPage, (void**)&pOvfl);
1205 if( rc!=0 ){
1206 return rc;
1207 }
1208 nextPage = SWAB32(pBt, pOvfl->iNext);
1209 if( offset<OVERFLOW_SIZE ){
1210 int a = amt;
1211 if( a + offset > OVERFLOW_SIZE ){
1212 a = OVERFLOW_SIZE - offset;
1213 }
1214 memcpy(zBuf, &pOvfl->aPayload[offset], a);
1215 offset = 0;
1216 amt -= a;
1217 zBuf += a;
1218 }else{
1219 offset -= OVERFLOW_SIZE;
1220 }
1221 sqlitepager_unref(pOvfl);
1222 }
1223 if( amt>0 ){
1224 return SQLITE_CORRUPT;
1225 }
1226 return SQLITE_OK;
1227}
1228
1229/*
1230** Read part of the key associated with cursor pCur. A maximum
1231** of "amt" bytes will be transfered into zBuf[]. The transfer
1232** begins at "offset". The number of bytes actually read is
1233** returned.
1234**
1235** Change: It used to be that the amount returned will be smaller
1236** than the amount requested if there are not enough bytes in the key
1237** to satisfy the request. But now, it must be the case that there
1238** is enough data available to satisfy the request. If not, an exception
1239** is raised. The change was made in an effort to boost performance
1240** by eliminating unneeded tests.
1241*/
1242static int fileBtreeKey(BtCursor *pCur, int offset, int amt, char *zBuf){
1243 MemPage *pPage;
1244
1245 assert( amt>=0 );
1246 assert( offset>=0 );
1247 assert( pCur->pPage!=0 );
1248 pPage = pCur->pPage;
1249 if( pCur->idx >= pPage->nCell ){
1250 return 0;
1251 }
1252 assert( amt+offset <= NKEY(pCur->pBt, pPage->apCell[pCur->idx]->h) );
1253 getPayload(pCur, offset, amt, zBuf);
1254 return amt;
1255}
1256
1257/*
1258** Set *pSize to the number of bytes of data in the entry the
1259** cursor currently points to. Always return SQLITE_OK.
1260** Failure is not possible. If the cursor is not currently
1261** pointing to an entry (which can happen, for example, if
1262** the database is empty) then *pSize is set to 0.
1263*/
1264static int fileBtreeDataSize(BtCursor *pCur, int *pSize){
1265 Cell *pCell;
1266 MemPage *pPage;
1267
1268 pPage = pCur->pPage;
1269 assert( pPage!=0 );
1270 if( pCur->idx >= pPage->nCell ){
1271 *pSize = 0;
1272 }else{
1273 pCell = pPage->apCell[pCur->idx];
1274 *pSize = NDATA(pCur->pBt, pCell->h);
1275 }
1276 return SQLITE_OK;
1277}
1278
1279/*
1280** Read part of the data associated with cursor pCur. A maximum
1281** of "amt" bytes will be transfered into zBuf[]. The transfer
1282** begins at "offset". The number of bytes actually read is
1283** returned. The amount returned will be smaller than the
1284** amount requested if there are not enough bytes in the data
1285** to satisfy the request.
1286*/
1287static int fileBtreeData(BtCursor *pCur, int offset, int amt, char *zBuf){
1288 Cell *pCell;
1289 MemPage *pPage;
1290
1291 assert( amt>=0 );
1292 assert( offset>=0 );
1293 assert( pCur->pPage!=0 );
1294 pPage = pCur->pPage;
1295 if( pCur->idx >= pPage->nCell ){
1296 return 0;
1297 }
1298 pCell = pPage->apCell[pCur->idx];
1299 assert( amt+offset <= NDATA(pCur->pBt, pCell->h) );
1300 getPayload(pCur, offset + NKEY(pCur->pBt, pCell->h), amt, zBuf);
1301 return amt;
1302}
1303
1304/*
1305** Compare an external key against the key on the entry that pCur points to.
1306**
1307** The external key is pKey and is nKey bytes long. The last nIgnore bytes
1308** of the key associated with pCur are ignored, as if they do not exist.
1309** (The normal case is for nIgnore to be zero in which case the entire
1310** internal key is used in the comparison.)
1311**
1312** The comparison result is written to *pRes as follows:
1313**
1314** *pRes<0 This means pCur<pKey
1315**
1316** *pRes==0 This means pCur==pKey for all nKey bytes
1317**
1318** *pRes>0 This means pCur>pKey
1319**
1320** When one key is an exact prefix of the other, the shorter key is
1321** considered less than the longer one. In order to be equal the
1322** keys must be exactly the same length. (The length of the pCur key
1323** is the actual key length minus nIgnore bytes.)
1324*/
1325static int fileBtreeKeyCompare(
1326 BtCursor *pCur, /* Pointer to entry to compare against */
1327 const void *pKey, /* Key to compare against entry that pCur points to */
1328 int nKey, /* Number of bytes in pKey */
1329 int nIgnore, /* Ignore this many bytes at the end of pCur */
1330 int *pResult /* Write the result here */
1331){
1332 Pgno nextPage;
1333 int n, c, rc, nLocal;
1334 Cell *pCell;
1335 Btree *pBt = pCur->pBt;
1336 const char *zKey = (const char*)pKey;
1337
1338 assert( pCur->pPage );
1339 assert( pCur->idx>=0 && pCur->idx<pCur->pPage->nCell );
1340 pCell = pCur->pPage->apCell[pCur->idx];
1341 nLocal = NKEY(pBt, pCell->h) - nIgnore;
1342 if( nLocal<0 ) nLocal = 0;
1343 n = nKey<nLocal ? nKey : nLocal;
1344 if( n>MX_LOCAL_PAYLOAD ){
1345 n = MX_LOCAL_PAYLOAD;
1346 }
1347 c = memcmp(pCell->aPayload, zKey, n);
1348 if( c!=0 ){
1349 *pResult = c;
1350 return SQLITE_OK;
1351 }
1352 zKey += n;
1353 nKey -= n;
1354 nLocal -= n;
1355 nextPage = SWAB32(pBt, pCell->ovfl);
1356 while( nKey>0 && nLocal>0 ){
1357 OverflowPage *pOvfl;
1358 if( nextPage==0 ){
1359 return SQLITE_CORRUPT;
1360 }
1361 rc = sqlitepager_get(pBt->pPager, nextPage, (void**)&pOvfl);
1362 if( rc ){
1363 return rc;
1364 }
1365 nextPage = SWAB32(pBt, pOvfl->iNext);
1366 n = nKey<nLocal ? nKey : nLocal;
1367 if( n>OVERFLOW_SIZE ){
1368 n = OVERFLOW_SIZE;
1369 }
1370 c = memcmp(pOvfl->aPayload, zKey, n);
1371 sqlitepager_unref(pOvfl);
1372 if( c!=0 ){
1373 *pResult = c;
1374 return SQLITE_OK;
1375 }
1376 nKey -= n;
1377 nLocal -= n;
1378 zKey += n;
1379 }
1380 if( c==0 ){
1381 c = nLocal - nKey;
1382 }
1383 *pResult = c;
1384 return SQLITE_OK;
1385}
1386
1387/*
1388** Move the cursor down to a new child page. The newPgno argument is the
1389** page number of the child page in the byte order of the disk image.
1390*/
1391static int moveToChild(BtCursor *pCur, int newPgno){
1392 int rc;
1393 MemPage *pNewPage;
1394 Btree *pBt = pCur->pBt;
1395
1396 newPgno = SWAB32(pBt, newPgno);
1397 rc = sqlitepager_get(pBt->pPager, newPgno, (void**)&pNewPage);
1398 if( rc ) return rc;
1399 rc = initPage(pBt, pNewPage, newPgno, pCur->pPage);
1400 if( rc ) return rc;
1401 assert( pCur->idx>=pCur->pPage->nCell
1402 || pCur->pPage->apCell[pCur->idx]->h.leftChild==SWAB32(pBt,newPgno) );
1403 assert( pCur->idx<pCur->pPage->nCell
1404 || pCur->pPage->u.hdr.rightChild==SWAB32(pBt,newPgno) );
1405 pNewPage->idxParent = pCur->idx;
1406 pCur->pPage->idxShift = 0;
1407 sqlitepager_unref(pCur->pPage);
1408 pCur->pPage = pNewPage;
1409 pCur->idx = 0;
1410 if( pNewPage->nCell<1 ){
1411 return SQLITE_CORRUPT;
1412 }
1413 return SQLITE_OK;
1414}
1415
1416/*
1417** Move the cursor up to the parent page.
1418**
1419** pCur->idx is set to the cell index that contains the pointer
1420** to the page we are coming from. If we are coming from the
1421** right-most child page then pCur->idx is set to one more than
1422** the largest cell index.
1423*/
1424static void moveToParent(BtCursor *pCur){
1425 Pgno oldPgno;
1426 MemPage *pParent;
1427 MemPage *pPage;
1428 int idxParent;
1429 pPage = pCur->pPage;
1430 assert( pPage!=0 );
1431 pParent = pPage->pParent;
1432 assert( pParent!=0 );
1433 idxParent = pPage->idxParent;
1434 sqlitepager_ref(pParent);
1435 sqlitepager_unref(pPage);
1436 pCur->pPage = pParent;
1437 assert( pParent->idxShift==0 );
1438 if( pParent->idxShift==0 ){
1439 pCur->idx = idxParent;
1440#ifndef NDEBUG
1441 /* Verify that pCur->idx is the correct index to point back to the child
1442 ** page we just came from
1443 */
1444 oldPgno = SWAB32(pCur->pBt, sqlitepager_pagenumber(pPage));
1445 if( pCur->idx<pParent->nCell ){
1446 assert( pParent->apCell[idxParent]->h.leftChild==oldPgno );
1447 }else{
1448 assert( pParent->u.hdr.rightChild==oldPgno );
1449 }
1450#endif
1451 }else{
1452 /* The MemPage.idxShift flag indicates that cell indices might have
1453 ** changed since idxParent was set and hence idxParent might be out
1454 ** of date. So recompute the parent cell index by scanning all cells
1455 ** and locating the one that points to the child we just came from.
1456 */
1457 int i;
1458 pCur->idx = pParent->nCell;
1459 oldPgno = SWAB32(pCur->pBt, sqlitepager_pagenumber(pPage));
1460 for(i=0; i<pParent->nCell; i++){
1461 if( pParent->apCell[i]->h.leftChild==oldPgno ){
1462 pCur->idx = i;
1463 break;
1464 }
1465 }
1466 }
1467}
1468
1469/*
1470** Move the cursor to the root page
1471*/
1472static int moveToRoot(BtCursor *pCur){
1473 MemPage *pNew;
1474 int rc;
1475 Btree *pBt = pCur->pBt;
1476
1477 rc = sqlitepager_get(pBt->pPager, pCur->pgnoRoot, (void**)&pNew);
1478 if( rc ) return rc;
1479 rc = initPage(pBt, pNew, pCur->pgnoRoot, 0);
1480 if( rc ) return rc;
1481 sqlitepager_unref(pCur->pPage);
1482 pCur->pPage = pNew;
1483 pCur->idx = 0;
1484 return SQLITE_OK;
1485}
1486
1487/*
1488** Move the cursor down to the left-most leaf entry beneath the
1489** entry to which it is currently pointing.
1490*/
1491static int moveToLeftmost(BtCursor *pCur){
1492 Pgno pgno;
1493 int rc;
1494
1495 while( (pgno = pCur->pPage->apCell[pCur->idx]->h.leftChild)!=0 ){
1496 rc = moveToChild(pCur, pgno);
1497 if( rc ) return rc;
1498 }
1499 return SQLITE_OK;
1500}
1501
1502/*
1503** Move the cursor down to the right-most leaf entry beneath the
1504** page to which it is currently pointing. Notice the difference
1505** between moveToLeftmost() and moveToRightmost(). moveToLeftmost()
1506** finds the left-most entry beneath the *entry* whereas moveToRightmost()
1507** finds the right-most entry beneath the *page*.
1508*/
1509static int moveToRightmost(BtCursor *pCur){
1510 Pgno pgno;
1511 int rc;
1512
1513 while( (pgno = pCur->pPage->u.hdr.rightChild)!=0 ){
1514 pCur->idx = pCur->pPage->nCell;
1515 rc = moveToChild(pCur, pgno);
1516 if( rc ) return rc;
1517 }
1518 pCur->idx = pCur->pPage->nCell - 1;
1519 return SQLITE_OK;
1520}
1521
1522/* Move the cursor to the first entry in the table. Return SQLITE_OK
1523** on success. Set *pRes to 0 if the cursor actually points to something
1524** or set *pRes to 1 if the table is empty.
1525*/
1526static int fileBtreeFirst(BtCursor *pCur, int *pRes){
1527 int rc;
1528 if( pCur->pPage==0 ) return SQLITE_ABORT;
1529 rc = moveToRoot(pCur);
1530 if( rc ) return rc;
1531 if( pCur->pPage->nCell==0 ){
1532 *pRes = 1;
1533 return SQLITE_OK;
1534 }
1535 *pRes = 0;
1536 rc = moveToLeftmost(pCur);
1537 pCur->eSkip = SKIP_NONE;
1538 return rc;
1539}
1540
1541/* Move the cursor to the last entry in the table. Return SQLITE_OK
1542** on success. Set *pRes to 0 if the cursor actually points to something
1543** or set *pRes to 1 if the table is empty.
1544*/
1545static int fileBtreeLast(BtCursor *pCur, int *pRes){
1546 int rc;
1547 if( pCur->pPage==0 ) return SQLITE_ABORT;
1548 rc = moveToRoot(pCur);
1549 if( rc ) return rc;
1550 assert( pCur->pPage->isInit );
1551 if( pCur->pPage->nCell==0 ){
1552 *pRes = 1;
1553 return SQLITE_OK;
1554 }
1555 *pRes = 0;
1556 rc = moveToRightmost(pCur);
1557 pCur->eSkip = SKIP_NONE;
1558 return rc;
1559}
1560
1561/* Move the cursor so that it points to an entry near pKey.
1562** Return a success code.
1563**
1564** If an exact match is not found, then the cursor is always
1565** left pointing at a leaf page which would hold the entry if it
1566** were present. The cursor might point to an entry that comes
1567** before or after the key.
1568**
1569** The result of comparing the key with the entry to which the
1570** cursor is left pointing is stored in pCur->iMatch. The same
1571** value is also written to *pRes if pRes!=NULL. The meaning of
1572** this value is as follows:
1573**
1574** *pRes<0 The cursor is left pointing at an entry that
1575** is smaller than pKey or if the table is empty
1576** and the cursor is therefore left point to nothing.
1577**
1578** *pRes==0 The cursor is left pointing at an entry that
1579** exactly matches pKey.
1580**
1581** *pRes>0 The cursor is left pointing at an entry that
1582** is larger than pKey.
1583*/
1584static
1585int fileBtreeMoveto(BtCursor *pCur, const void *pKey, int nKey, int *pRes){
1586 int rc;
1587 if( pCur->pPage==0 ) return SQLITE_ABORT;
1588 pCur->eSkip = SKIP_NONE;
1589 rc = moveToRoot(pCur);
1590 if( rc ) return rc;
1591 for(;;){
1592 int lwr, upr;
1593 Pgno chldPg;
1594 MemPage *pPage = pCur->pPage;
1595 int c = -1; /* pRes return if table is empty must be -1 */
1596 lwr = 0;
1597 upr = pPage->nCell-1;
1598 while( lwr<=upr ){
1599 pCur->idx = (lwr+upr)/2;
1600 rc = fileBtreeKeyCompare(pCur, pKey, nKey, 0, &c);
1601 if( rc ) return rc;
1602 if( c==0 ){
1603 pCur->iMatch = c;
1604 if( pRes ) *pRes = 0;
1605 return SQLITE_OK;
1606 }
1607 if( c<0 ){
1608 lwr = pCur->idx+1;
1609 }else{
1610 upr = pCur->idx-1;
1611 }
1612 }
1613 assert( lwr==upr+1 );
1614 assert( pPage->isInit );
1615 if( lwr>=pPage->nCell ){
1616 chldPg = pPage->u.hdr.rightChild;
1617 }else{
1618 chldPg = pPage->apCell[lwr]->h.leftChild;
1619 }
1620 if( chldPg==0 ){
1621 pCur->iMatch = c;
1622 if( pRes ) *pRes = c;
1623 return SQLITE_OK;
1624 }
1625 pCur->idx = lwr;
1626 rc = moveToChild(pCur, chldPg);
1627 if( rc ) return rc;
1628 }
1629 /* NOT REACHED */
1630}
1631
1632/*
1633** Advance the cursor to the next entry in the database. If
1634** successful then set *pRes=0. If the cursor
1635** was already pointing to the last entry in the database before
1636** this routine was called, then set *pRes=1.
1637*/
1638static int fileBtreeNext(BtCursor *pCur, int *pRes){
1639 int rc;
1640 MemPage *pPage = pCur->pPage;
1641 assert( pRes!=0 );
1642 if( pPage==0 ){
1643 *pRes = 1;
1644 return SQLITE_ABORT;
1645 }
1646 assert( pPage->isInit );
1647 assert( pCur->eSkip!=SKIP_INVALID );
1648 if( pPage->nCell==0 ){
1649 *pRes = 1;
1650 return SQLITE_OK;
1651 }
1652 assert( pCur->idx<pPage->nCell );
1653 if( pCur->eSkip==SKIP_NEXT ){
1654 pCur->eSkip = SKIP_NONE;
1655 *pRes = 0;
1656 return SQLITE_OK;
1657 }
1658 pCur->eSkip = SKIP_NONE;
1659 pCur->idx++;
1660 if( pCur->idx>=pPage->nCell ){
1661 if( pPage->u.hdr.rightChild ){
1662 rc = moveToChild(pCur, pPage->u.hdr.rightChild);
1663 if( rc ) return rc;
1664 rc = moveToLeftmost(pCur);
1665 *pRes = 0;
1666 return rc;
1667 }
1668 do{
1669 if( pPage->pParent==0 ){
1670 *pRes = 1;
1671 return SQLITE_OK;
1672 }
1673 moveToParent(pCur);
1674 pPage = pCur->pPage;
1675 }while( pCur->idx>=pPage->nCell );
1676 *pRes = 0;
1677 return SQLITE_OK;
1678 }
1679 *pRes = 0;
1680 if( pPage->u.hdr.rightChild==0 ){
1681 return SQLITE_OK;
1682 }
1683 rc = moveToLeftmost(pCur);
1684 return rc;
1685}
1686
1687/*
1688** Step the cursor to the back to the previous entry in the database. If
1689** successful then set *pRes=0. If the cursor
1690** was already pointing to the first entry in the database before
1691** this routine was called, then set *pRes=1.
1692*/
1693static int fileBtreePrevious(BtCursor *pCur, int *pRes){
1694 int rc;
1695 Pgno pgno;
1696 MemPage *pPage;
1697 pPage = pCur->pPage;
1698 if( pPage==0 ){
1699 *pRes = 1;
1700 return SQLITE_ABORT;
1701 }
1702 assert( pPage->isInit );
1703 assert( pCur->eSkip!=SKIP_INVALID );
1704 if( pPage->nCell==0 ){
1705 *pRes = 1;
1706 return SQLITE_OK;
1707 }
1708 if( pCur->eSkip==SKIP_PREV ){
1709 pCur->eSkip = SKIP_NONE;
1710 *pRes = 0;
1711 return SQLITE_OK;
1712 }
1713 pCur->eSkip = SKIP_NONE;
1714 assert( pCur->idx>=0 );
1715 if( (pgno = pPage->apCell[pCur->idx]->h.leftChild)!=0 ){
1716 rc = moveToChild(pCur, pgno);
1717 if( rc ) return rc;
1718 rc = moveToRightmost(pCur);
1719 }else{
1720 while( pCur->idx==0 ){
1721 if( pPage->pParent==0 ){
1722 if( pRes ) *pRes = 1;
1723 return SQLITE_OK;
1724 }
1725 moveToParent(pCur);
1726 pPage = pCur->pPage;
1727 }
1728 pCur->idx--;
1729 rc = SQLITE_OK;
1730 }
1731 *pRes = 0;
1732 return rc;
1733}
1734
1735/*
1736** Allocate a new page from the database file.
1737**
1738** The new page is marked as dirty. (In other words, sqlitepager_write()
1739** has already been called on the new page.) The new page has also
1740** been referenced and the calling routine is responsible for calling
1741** sqlitepager_unref() on the new page when it is done.
1742**
1743** SQLITE_OK is returned on success. Any other return value indicates
1744** an error. *ppPage and *pPgno are undefined in the event of an error.
1745** Do not invoke sqlitepager_unref() on *ppPage if an error is returned.
1746**
1747** If the "nearby" parameter is not 0, then a (feeble) effort is made to
1748** locate a page close to the page number "nearby". This can be used in an
1749** attempt to keep related pages close to each other in the database file,
1750** which in turn can make database access faster.
1751*/
1752static int allocatePage(Btree *pBt, MemPage **ppPage, Pgno *pPgno, Pgno nearby){
1753 PageOne *pPage1 = pBt->page1;
1754 int rc;
1755 if( pPage1->freeList ){
1756 OverflowPage *pOvfl;
1757 FreelistInfo *pInfo;
1758
1759 rc = sqlitepager_write(pPage1);
1760 if( rc ) return rc;
1761 SWAB_ADD(pBt, pPage1->nFree, -1);
1762 rc = sqlitepager_get(pBt->pPager, SWAB32(pBt, pPage1->freeList),
1763 (void**)&pOvfl);
1764 if( rc ) return rc;
1765 rc = sqlitepager_write(pOvfl);
1766 if( rc ){
1767 sqlitepager_unref(pOvfl);
1768 return rc;
1769 }
1770 pInfo = (FreelistInfo*)pOvfl->aPayload;
1771 if( pInfo->nFree==0 ){
1772 *pPgno = SWAB32(pBt, pPage1->freeList);
1773 pPage1->freeList = pOvfl->iNext;
1774 *ppPage = (MemPage*)pOvfl;
1775 }else{
1776 int closest, n;
1777 n = SWAB32(pBt, pInfo->nFree);
1778 if( n>1 && nearby>0 ){
1779 int i, dist;
1780 closest = 0;
1781 dist = SWAB32(pBt, pInfo->aFree[0]) - nearby;
1782 if( dist<0 ) dist = -dist;
1783 for(i=1; i<n; i++){
1784 int d2 = SWAB32(pBt, pInfo->aFree[i]) - nearby;
1785 if( d2<0 ) d2 = -d2;
1786 if( d2<dist ) closest = i;
1787 }
1788 }else{
1789 closest = 0;
1790 }
1791 SWAB_ADD(pBt, pInfo->nFree, -1);
1792 *pPgno = SWAB32(pBt, pInfo->aFree[closest]);
1793 pInfo->aFree[closest] = pInfo->aFree[n-1];
1794 rc = sqlitepager_get(pBt->pPager, *pPgno, (void**)ppPage);
1795 sqlitepager_unref(pOvfl);
1796 if( rc==SQLITE_OK ){
1797 sqlitepager_dont_rollback(*ppPage);
1798 rc = sqlitepager_write(*ppPage);
1799 }
1800 }
1801 }else{
1802 *pPgno = sqlitepager_pagecount(pBt->pPager) + 1;
1803 rc = sqlitepager_get(pBt->pPager, *pPgno, (void**)ppPage);
1804 if( rc ) return rc;
1805 rc = sqlitepager_write(*ppPage);
1806 }
1807 return rc;
1808}
1809
1810/*
1811** Add a page of the database file to the freelist. Either pgno or
1812** pPage but not both may be 0.
1813**
1814** sqlitepager_unref() is NOT called for pPage.
1815*/
1816static int freePage(Btree *pBt, void *pPage, Pgno pgno){
1817 PageOne *pPage1 = pBt->page1;
1818 OverflowPage *pOvfl = (OverflowPage*)pPage;
1819 int rc;
1820 int needUnref = 0;
1821 MemPage *pMemPage;
1822
1823 if( pgno==0 ){
1824 assert( pOvfl!=0 );
1825 pgno = sqlitepager_pagenumber(pOvfl);
1826 }
1827 assert( pgno>2 );
1828 assert( sqlitepager_pagenumber(pOvfl)==pgno );
1829 pMemPage = (MemPage*)pPage;
1830 pMemPage->isInit = 0;
1831 if( pMemPage->pParent ){
1832 sqlitepager_unref(pMemPage->pParent);
1833 pMemPage->pParent = 0;
1834 }
1835 rc = sqlitepager_write(pPage1);
1836 if( rc ){
1837 return rc;
1838 }
1839 SWAB_ADD(pBt, pPage1->nFree, 1);
1840 if( pPage1->nFree!=0 && pPage1->freeList!=0 ){
1841 OverflowPage *pFreeIdx;
1842 rc = sqlitepager_get(pBt->pPager, SWAB32(pBt, pPage1->freeList),
1843 (void**)&pFreeIdx);
1844 if( rc==SQLITE_OK ){
1845 FreelistInfo *pInfo = (FreelistInfo*)pFreeIdx->aPayload;
1846 int n = SWAB32(pBt, pInfo->nFree);
1847 if( n<(sizeof(pInfo->aFree)/sizeof(pInfo->aFree[0])) ){
1848 rc = sqlitepager_write(pFreeIdx);
1849 if( rc==SQLITE_OK ){
1850 pInfo->aFree[n] = SWAB32(pBt, pgno);
1851 SWAB_ADD(pBt, pInfo->nFree, 1);
1852 sqlitepager_unref(pFreeIdx);
1853 sqlitepager_dont_write(pBt->pPager, pgno);
1854 return rc;
1855 }
1856 }
1857 sqlitepager_unref(pFreeIdx);
1858 }
1859 }
1860 if( pOvfl==0 ){
1861 assert( pgno>0 );
1862 rc = sqlitepager_get(pBt->pPager, pgno, (void**)&pOvfl);
1863 if( rc ) return rc;
1864 needUnref = 1;
1865 }
1866 rc = sqlitepager_write(pOvfl);
1867 if( rc ){
1868 if( needUnref ) sqlitepager_unref(pOvfl);
1869 return rc;
1870 }
1871 pOvfl->iNext = pPage1->freeList;
1872 pPage1->freeList = SWAB32(pBt, pgno);
1873 memset(pOvfl->aPayload, 0, OVERFLOW_SIZE);
1874 if( needUnref ) rc = sqlitepager_unref(pOvfl);
1875 return rc;
1876}
1877
1878/*
1879** Erase all the data out of a cell. This involves returning overflow
1880** pages back the freelist.
1881*/
1882static int clearCell(Btree *pBt, Cell *pCell){
1883 Pager *pPager = pBt->pPager;
1884 OverflowPage *pOvfl;
1885 Pgno ovfl, nextOvfl;
1886 int rc;
1887
1888 if( NKEY(pBt, pCell->h) + NDATA(pBt, pCell->h) <= MX_LOCAL_PAYLOAD ){
1889 return SQLITE_OK;
1890 }
1891 ovfl = SWAB32(pBt, pCell->ovfl);
1892 pCell->ovfl = 0;
1893 while( ovfl ){
1894 rc = sqlitepager_get(pPager, ovfl, (void**)&pOvfl);
1895 if( rc ) return rc;
1896 nextOvfl = SWAB32(pBt, pOvfl->iNext);
1897 rc = freePage(pBt, pOvfl, ovfl);
1898 if( rc ) return rc;
1899 sqlitepager_unref(pOvfl);
1900 ovfl = nextOvfl;
1901 }
1902 return SQLITE_OK;
1903}
1904
1905/*
1906** Create a new cell from key and data. Overflow pages are allocated as
1907** necessary and linked to this cell.
1908*/
1909static int fillInCell(
1910 Btree *pBt, /* The whole Btree. Needed to allocate pages */
1911 Cell *pCell, /* Populate this Cell structure */
1912 const void *pKey, int nKey, /* The key */
1913 const void *pData,int nData /* The data */
1914){
1915 OverflowPage *pOvfl, *pPrior;
1916 Pgno *pNext;
1917 int spaceLeft;
1918 int n, rc;
1919 int nPayload;
1920 const char *pPayload;
1921 char *pSpace;
1922 Pgno nearby = 0;
1923
1924 pCell->h.leftChild = 0;
1925 pCell->h.nKey = SWAB16(pBt, nKey & 0xffff);
1926 pCell->h.nKeyHi = nKey >> 16;
1927 pCell->h.nData = SWAB16(pBt, nData & 0xffff);
1928 pCell->h.nDataHi = nData >> 16;
1929 pCell->h.iNext = 0;
1930
1931 pNext = &pCell->ovfl;
1932 pSpace = pCell->aPayload;
1933 spaceLeft = MX_LOCAL_PAYLOAD;
1934 pPayload = pKey;
1935 pKey = 0;
1936 nPayload = nKey;
1937 pPrior = 0;
1938 while( nPayload>0 ){
1939 if( spaceLeft==0 ){
1940 rc = allocatePage(pBt, (MemPage**)&pOvfl, pNext, nearby);
1941 if( rc ){
1942 *pNext = 0;
1943 }else{
1944 nearby = *pNext;
1945 }
1946 if( pPrior ) sqlitepager_unref(pPrior);
1947 if( rc ){
1948 clearCell(pBt, pCell);
1949 return rc;
1950 }
1951 if( pBt->needSwab ) *pNext = swab32(*pNext);
1952 pPrior = pOvfl;
1953 spaceLeft = OVERFLOW_SIZE;
1954 pSpace = pOvfl->aPayload;
1955 pNext = &pOvfl->iNext;
1956 }
1957 n = nPayload;
1958 if( n>spaceLeft ) n = spaceLeft;
1959 memcpy(pSpace, pPayload, n);
1960 nPayload -= n;
1961 if( nPayload==0 && pData ){
1962 pPayload = pData;
1963 nPayload = nData;
1964 pData = 0;
1965 }else{
1966 pPayload += n;
1967 }
1968 spaceLeft -= n;
1969 pSpace += n;
1970 }
1971 *pNext = 0;
1972 if( pPrior ){
1973 sqlitepager_unref(pPrior);
1974 }
1975 return SQLITE_OK;
1976}
1977
1978/*
1979** Change the MemPage.pParent pointer on the page whose number is
1980** given in the second argument so that MemPage.pParent holds the
1981** pointer in the third argument.
1982*/
1983static void reparentPage(Pager *pPager, Pgno pgno, MemPage *pNewParent,int idx){
1984 MemPage *pThis;
1985
1986 if( pgno==0 ) return;
1987 assert( pPager!=0 );
1988 pThis = sqlitepager_lookup(pPager, pgno);
1989 if( pThis && pThis->isInit ){
1990 if( pThis->pParent!=pNewParent ){
1991 if( pThis->pParent ) sqlitepager_unref(pThis->pParent);
1992 pThis->pParent = pNewParent;
1993 if( pNewParent ) sqlitepager_ref(pNewParent);
1994 }
1995 pThis->idxParent = idx;
1996 sqlitepager_unref(pThis);
1997 }
1998}
1999
2000/*
2001** Reparent all children of the given page to be the given page.
2002** In other words, for every child of pPage, invoke reparentPage()
2003** to make sure that each child knows that pPage is its parent.
2004**
2005** This routine gets called after you memcpy() one page into
2006** another.
2007*/
2008static void reparentChildPages(Btree *pBt, MemPage *pPage){
2009 int i;
2010 Pager *pPager = pBt->pPager;
2011 for(i=0; i<pPage->nCell; i++){
2012 reparentPage(pPager, SWAB32(pBt, pPage->apCell[i]->h.leftChild), pPage, i);
2013 }
2014 reparentPage(pPager, SWAB32(pBt, pPage->u.hdr.rightChild), pPage, i);
2015 pPage->idxShift = 0;
2016}
2017
2018/*
2019** Remove the i-th cell from pPage. This routine effects pPage only.
2020** The cell content is not freed or deallocated. It is assumed that
2021** the cell content has been copied someplace else. This routine just
2022** removes the reference to the cell from pPage.
2023**
2024** "sz" must be the number of bytes in the cell.
2025**
2026** Do not bother maintaining the integrity of the linked list of Cells.
2027** Only the pPage->apCell[] array is important. The relinkCellList()
2028** routine will be called soon after this routine in order to rebuild
2029** the linked list.
2030*/
2031static void dropCell(Btree *pBt, MemPage *pPage, int idx, int sz){
2032 int j;
2033 assert( idx>=0 && idx<pPage->nCell );
2034 assert( sz==cellSize(pBt, pPage->apCell[idx]) );
2035 assert( sqlitepager_iswriteable(pPage) );
2036 freeSpace(pBt, pPage, Addr(pPage->apCell[idx]) - Addr(pPage), sz);
2037 for(j=idx; j<pPage->nCell-1; j++){
2038 pPage->apCell[j] = pPage->apCell[j+1];
2039 }
2040 pPage->nCell--;
2041 pPage->idxShift = 1;
2042}
2043
2044/*
2045** Insert a new cell on pPage at cell index "i". pCell points to the
2046** content of the cell.
2047**
2048** If the cell content will fit on the page, then put it there. If it
2049** will not fit, then just make pPage->apCell[i] point to the content
2050** and set pPage->isOverfull.
2051**
2052** Do not bother maintaining the integrity of the linked list of Cells.
2053** Only the pPage->apCell[] array is important. The relinkCellList()
2054** routine will be called soon after this routine in order to rebuild
2055** the linked list.
2056*/
2057static void insertCell(Btree *pBt, MemPage *pPage, int i, Cell *pCell, int sz){
2058 int idx, j;
2059 assert( i>=0 && i<=pPage->nCell );
2060 assert( sz==cellSize(pBt, pCell) );
2061 assert( sqlitepager_iswriteable(pPage) );
2062 idx = allocateSpace(pBt, pPage, sz);
2063 for(j=pPage->nCell; j>i; j--){
2064 pPage->apCell[j] = pPage->apCell[j-1];
2065 }
2066 pPage->nCell++;
2067 if( idx<=0 ){
2068 pPage->isOverfull = 1;
2069 pPage->apCell[i] = pCell;
2070 }else{
2071 memcpy(&pPage->u.aDisk[idx], pCell, sz);
2072 pPage->apCell[i] = (Cell*)&pPage->u.aDisk[idx];
2073 }
2074 pPage->idxShift = 1;
2075}
2076
2077/*
2078** Rebuild the linked list of cells on a page so that the cells
2079** occur in the order specified by the pPage->apCell[] array.
2080** Invoke this routine once to repair damage after one or more
2081** invocations of either insertCell() or dropCell().
2082*/
2083static void relinkCellList(Btree *pBt, MemPage *pPage){
2084 int i;
2085 u16 *pIdx;
2086 assert( sqlitepager_iswriteable(pPage) );
2087 pIdx = &pPage->u.hdr.firstCell;
2088 for(i=0; i<pPage->nCell; i++){
2089 int idx = Addr(pPage->apCell[i]) - Addr(pPage);
2090 assert( idx>0 && idx<SQLITE_USABLE_SIZE );
2091 *pIdx = SWAB16(pBt, idx);
2092 pIdx = &pPage->apCell[i]->h.iNext;
2093 }
2094 *pIdx = 0;
2095}
2096
2097/*
2098** Make a copy of the contents of pFrom into pTo. The pFrom->apCell[]
2099** pointers that point into pFrom->u.aDisk[] must be adjusted to point
2100** into pTo->u.aDisk[] instead. But some pFrom->apCell[] entries might
2101** not point to pFrom->u.aDisk[]. Those are unchanged.
2102*/
2103static void copyPage(MemPage *pTo, MemPage *pFrom){
2104 uptr from, to;
2105 int i;
2106 memcpy(pTo->u.aDisk, pFrom->u.aDisk, SQLITE_USABLE_SIZE);
2107 pTo->pParent = 0;
2108 pTo->isInit = 1;
2109 pTo->nCell = pFrom->nCell;
2110 pTo->nFree = pFrom->nFree;
2111 pTo->isOverfull = pFrom->isOverfull;
2112 to = Addr(pTo);
2113 from = Addr(pFrom);
2114 for(i=0; i<pTo->nCell; i++){
2115 uptr x = Addr(pFrom->apCell[i]);
2116 if( x>from && x<from+SQLITE_USABLE_SIZE ){
2117 *((uptr*)&pTo->apCell[i]) = x + to - from;
2118 }else{
2119 pTo->apCell[i] = pFrom->apCell[i];
2120 }
2121 }
2122}
2123
2124/*
2125** The following parameters determine how many adjacent pages get involved
2126** in a balancing operation. NN is the number of neighbors on either side
2127** of the page that participate in the balancing operation. NB is the
2128** total number of pages that participate, including the target page and
2129** NN neighbors on either side.
2130**
2131** The minimum value of NN is 1 (of course). Increasing NN above 1
2132** (to 2 or 3) gives a modest improvement in SELECT and DELETE performance
2133** in exchange for a larger degradation in INSERT and UPDATE performance.
2134** The value of NN appears to give the best results overall.
2135*/
2136#define NN 1 /* Number of neighbors on either side of pPage */
2137#define NB (NN*2+1) /* Total pages involved in the balance */
2138
2139/*
2140** This routine redistributes Cells on pPage and up to two siblings
2141** of pPage so that all pages have about the same amount of free space.
2142** Usually one sibling on either side of pPage is used in the balancing,
2143** though both siblings might come from one side if pPage is the first
2144** or last child of its parent. If pPage has fewer than two siblings
2145** (something which can only happen if pPage is the root page or a
2146** child of root) then all available siblings participate in the balancing.
2147**
2148** The number of siblings of pPage might be increased or decreased by
2149** one in an effort to keep pages between 66% and 100% full. The root page
2150** is special and is allowed to be less than 66% full. If pPage is
2151** the root page, then the depth of the tree might be increased
2152** or decreased by one, as necessary, to keep the root page from being
2153** overfull or empty.
2154**
2155** This routine calls relinkCellList() on its input page regardless of
2156** whether or not it does any real balancing. Client routines will typically
2157** invoke insertCell() or dropCell() before calling this routine, so we
2158** need to call relinkCellList() to clean up the mess that those other
2159** routines left behind.
2160**
2161** pCur is left pointing to the same cell as when this routine was called
2162** even if that cell gets moved to a different page. pCur may be NULL.
2163** Set the pCur parameter to NULL if you do not care about keeping track
2164** of a cell as that will save this routine the work of keeping track of it.
2165**
2166** Note that when this routine is called, some of the Cells on pPage
2167** might not actually be stored in pPage->u.aDisk[]. This can happen
2168** if the page is overfull. Part of the job of this routine is to
2169** make sure all Cells for pPage once again fit in pPage->u.aDisk[].
2170**
2171** In the course of balancing the siblings of pPage, the parent of pPage
2172** might become overfull or underfull. If that happens, then this routine
2173** is called recursively on the parent.
2174**
2175** If this routine fails for any reason, it might leave the database
2176** in a corrupted state. So if this routine fails, the database should
2177** be rolled back.
2178*/
2179static int balance(Btree *pBt, MemPage *pPage, BtCursor *pCur){
2180 MemPage *pParent; /* The parent of pPage */
2181 int nCell; /* Number of cells in apCell[] */
2182 int nOld; /* Number of pages in apOld[] */
2183 int nNew; /* Number of pages in apNew[] */
2184 int nDiv; /* Number of cells in apDiv[] */
2185 int i, j, k; /* Loop counters */
2186 int idx; /* Index of pPage in pParent->apCell[] */
2187 int nxDiv; /* Next divider slot in pParent->apCell[] */
2188 int rc; /* The return code */
2189 int iCur; /* apCell[iCur] is the cell of the cursor */
2190 MemPage *pOldCurPage; /* The cursor originally points to this page */
2191 int subtotal; /* Subtotal of bytes in cells on one page */
2192 MemPage *extraUnref = 0; /* A page that needs to be unref-ed */
2193 MemPage *apOld[NB]; /* pPage and up to two siblings */
2194 Pgno pgnoOld[NB]; /* Page numbers for each page in apOld[] */
2195 MemPage *apNew[NB+1]; /* pPage and up to NB siblings after balancing */
2196 Pgno pgnoNew[NB+1]; /* Page numbers for each page in apNew[] */
2197 int idxDiv[NB]; /* Indices of divider cells in pParent */
2198 Cell *apDiv[NB]; /* Divider cells in pParent */
2199 Cell aTemp[NB]; /* Temporary holding area for apDiv[] */
2200 int cntNew[NB+1]; /* Index in apCell[] of cell after i-th page */
2201 int szNew[NB+1]; /* Combined size of cells place on i-th page */
2202 MemPage aOld[NB]; /* Temporary copies of pPage and its siblings */
2203 Cell *apCell[(MX_CELL+2)*NB]; /* All cells from pages being balanced */
2204 int szCell[(MX_CELL+2)*NB]; /* Local size of all cells */
2205
2206 /*
2207 ** Return without doing any work if pPage is neither overfull nor
2208 ** underfull.
2209 */
2210 assert( sqlitepager_iswriteable(pPage) );
2211 if( !pPage->isOverfull && pPage->nFree<SQLITE_USABLE_SIZE/2
2212 && pPage->nCell>=2){
2213 relinkCellList(pBt, pPage);
2214 return SQLITE_OK;
2215 }
2216
2217 /*
2218 ** Find the parent of the page to be balanceed.
2219 ** If there is no parent, it means this page is the root page and
2220 ** special rules apply.
2221 */
2222 pParent = pPage->pParent;
2223 if( pParent==0 ){
2224 Pgno pgnoChild;
2225 MemPage *pChild;
2226 assert( pPage->isInit );
2227 if( pPage->nCell==0 ){
2228 if( pPage->u.hdr.rightChild ){
2229 /*
2230 ** The root page is empty. Copy the one child page
2231 ** into the root page and return. This reduces the depth
2232 ** of the BTree by one.
2233 */
2234 pgnoChild = SWAB32(pBt, pPage->u.hdr.rightChild);
2235 rc = sqlitepager_get(pBt->pPager, pgnoChild, (void**)&pChild);
2236 if( rc ) return rc;
2237 memcpy(pPage, pChild, SQLITE_USABLE_SIZE);
2238 pPage->isInit = 0;
2239 rc = initPage(pBt, pPage, sqlitepager_pagenumber(pPage), 0);
2240 assert( rc==SQLITE_OK );
2241 reparentChildPages(pBt, pPage);
2242 if( pCur && pCur->pPage==pChild ){
2243 sqlitepager_unref(pChild);
2244 pCur->pPage = pPage;
2245 sqlitepager_ref(pPage);
2246 }
2247 freePage(pBt, pChild, pgnoChild);
2248 sqlitepager_unref(pChild);
2249 }else{
2250 relinkCellList(pBt, pPage);
2251 }
2252 return SQLITE_OK;
2253 }
2254 if( !pPage->isOverfull ){
2255 /* It is OK for the root page to be less than half full.
2256 */
2257 relinkCellList(pBt, pPage);
2258 return SQLITE_OK;
2259 }
2260 /*
2261 ** If we get to here, it means the root page is overfull.
2262 ** When this happens, Create a new child page and copy the
2263 ** contents of the root into the child. Then make the root
2264 ** page an empty page with rightChild pointing to the new
2265 ** child. Then fall thru to the code below which will cause
2266 ** the overfull child page to be split.
2267 */
2268 rc = sqlitepager_write(pPage);
2269 if( rc ) return rc;
2270 rc = allocatePage(pBt, &pChild, &pgnoChild, sqlitepager_pagenumber(pPage));
2271 if( rc ) return rc;
2272 assert( sqlitepager_iswriteable(pChild) );
2273 copyPage(pChild, pPage);
2274 pChild->pParent = pPage;
2275 pChild->idxParent = 0;
2276 sqlitepager_ref(pPage);
2277 pChild->isOverfull = 1;
2278 if( pCur && pCur->pPage==pPage ){
2279 sqlitepager_unref(pPage);
2280 pCur->pPage = pChild;
2281 }else{
2282 extraUnref = pChild;
2283 }
2284 zeroPage(pBt, pPage);
2285 pPage->u.hdr.rightChild = SWAB32(pBt, pgnoChild);
2286 pParent = pPage;
2287 pPage = pChild;
2288 }
2289 rc = sqlitepager_write(pParent);
2290 if( rc ) return rc;
2291 assert( pParent->isInit );
2292
2293 /*
2294 ** Find the Cell in the parent page whose h.leftChild points back
2295 ** to pPage. The "idx" variable is the index of that cell. If pPage
2296 ** is the rightmost child of pParent then set idx to pParent->nCell
2297 */
2298 if( pParent->idxShift ){
2299 Pgno pgno, swabPgno;
2300 pgno = sqlitepager_pagenumber(pPage);
2301 swabPgno = SWAB32(pBt, pgno);
2302 for(idx=0; idx<pParent->nCell; idx++){
2303 if( pParent->apCell[idx]->h.leftChild==swabPgno ){
2304 break;
2305 }
2306 }
2307 assert( idx<pParent->nCell || pParent->u.hdr.rightChild==swabPgno );
2308 }else{
2309 idx = pPage->idxParent;
2310 }
2311
2312 /*
2313 ** Initialize variables so that it will be safe to jump
2314 ** directly to balance_cleanup at any moment.
2315 */
2316 nOld = nNew = 0;
2317 sqlitepager_ref(pParent);
2318
2319 /*
2320 ** Find sibling pages to pPage and the Cells in pParent that divide
2321 ** the siblings. An attempt is made to find NN siblings on either
2322 ** side of pPage. More siblings are taken from one side, however, if
2323 ** pPage there are fewer than NN siblings on the other side. If pParent
2324 ** has NB or fewer children then all children of pParent are taken.
2325 */
2326 nxDiv = idx - NN;
2327 if( nxDiv + NB > pParent->nCell ){
2328 nxDiv = pParent->nCell - NB + 1;
2329 }
2330 if( nxDiv<0 ){
2331 nxDiv = 0;
2332 }
2333 nDiv = 0;
2334 for(i=0, k=nxDiv; i<NB; i++, k++){
2335 if( k<pParent->nCell ){
2336 idxDiv[i] = k;
2337 apDiv[i] = pParent->apCell[k];
2338 nDiv++;
2339 pgnoOld[i] = SWAB32(pBt, apDiv[i]->h.leftChild);
2340 }else if( k==pParent->nCell ){
2341 pgnoOld[i] = SWAB32(pBt, pParent->u.hdr.rightChild);
2342 }else{
2343 break;
2344 }
2345 rc = sqlitepager_get(pBt->pPager, pgnoOld[i], (void**)&apOld[i]);
2346 if( rc ) goto balance_cleanup;
2347 rc = initPage(pBt, apOld[i], pgnoOld[i], pParent);
2348 if( rc ) goto balance_cleanup;
2349 apOld[i]->idxParent = k;
2350 nOld++;
2351 }
2352
2353 /*
2354 ** Set iCur to be the index in apCell[] of the cell that the cursor
2355 ** is pointing to. We will need this later on in order to keep the
2356 ** cursor pointing at the same cell. If pCur points to a page that
2357 ** has no involvement with this rebalancing, then set iCur to a large
2358 ** number so that the iCur==j tests always fail in the main cell
2359 ** distribution loop below.
2360 */
2361 if( pCur ){
2362 iCur = 0;
2363 for(i=0; i<nOld; i++){
2364 if( pCur->pPage==apOld[i] ){
2365 iCur += pCur->idx;
2366 break;
2367 }
2368 iCur += apOld[i]->nCell;
2369 if( i<nOld-1 && pCur->pPage==pParent && pCur->idx==idxDiv[i] ){
2370 break;
2371 }
2372 iCur++;
2373 }
2374 pOldCurPage = pCur->pPage;
2375 }
2376
2377 /*
2378 ** Make copies of the content of pPage and its siblings into aOld[].
2379 ** The rest of this function will use data from the copies rather
2380 ** that the original pages since the original pages will be in the
2381 ** process of being overwritten.
2382 */
2383 for(i=0; i<nOld; i++){
2384 copyPage(&aOld[i], apOld[i]);
2385 }
2386
2387 /*
2388 ** Load pointers to all cells on sibling pages and the divider cells
2389 ** into the local apCell[] array. Make copies of the divider cells
2390 ** into aTemp[] and remove the the divider Cells from pParent.
2391 */
2392 nCell = 0;
2393 for(i=0; i<nOld; i++){
2394 MemPage *pOld = &aOld[i];
2395 for(j=0; j<pOld->nCell; j++){
2396 apCell[nCell] = pOld->apCell[j];
2397 szCell[nCell] = cellSize(pBt, apCell[nCell]);
2398 nCell++;
2399 }
2400 if( i<nOld-1 ){
2401 szCell[nCell] = cellSize(pBt, apDiv[i]);
2402 memcpy(&aTemp[i], apDiv[i], szCell[nCell]);
2403 apCell[nCell] = &aTemp[i];
2404 dropCell(pBt, pParent, nxDiv, szCell[nCell]);
2405 assert( SWAB32(pBt, apCell[nCell]->h.leftChild)==pgnoOld[i] );
2406 apCell[nCell]->h.leftChild = pOld->u.hdr.rightChild;
2407 nCell++;
2408 }
2409 }
2410
2411 /*
2412 ** Figure out the number of pages needed to hold all nCell cells.
2413 ** Store this number in "k". Also compute szNew[] which is the total
2414 ** size of all cells on the i-th page and cntNew[] which is the index
2415 ** in apCell[] of the cell that divides path i from path i+1.
2416 ** cntNew[k] should equal nCell.
2417 **
2418 ** This little patch of code is critical for keeping the tree
2419 ** balanced.
2420 */
2421 for(subtotal=k=i=0; i<nCell; i++){
2422 subtotal += szCell[i];
2423 if( subtotal > USABLE_SPACE ){
2424 szNew[k] = subtotal - szCell[i];
2425 cntNew[k] = i;
2426 subtotal = 0;
2427 k++;
2428 }
2429 }
2430 szNew[k] = subtotal;
2431 cntNew[k] = nCell;
2432 k++;
2433 for(i=k-1; i>0; i--){
2434 while( szNew[i]<USABLE_SPACE/2 ){
2435 cntNew[i-1]--;
2436 assert( cntNew[i-1]>0 );
2437 szNew[i] += szCell[cntNew[i-1]];
2438 szNew[i-1] -= szCell[cntNew[i-1]-1];
2439 }
2440 }
2441 assert( cntNew[0]>0 );
2442
2443 /*
2444 ** Allocate k new pages. Reuse old pages where possible.
2445 */
2446 for(i=0; i<k; i++){
2447 if( i<nOld ){
2448 apNew[i] = apOld[i];
2449 pgnoNew[i] = pgnoOld[i];
2450 apOld[i] = 0;
2451 sqlitepager_write(apNew[i]);
2452 }else{
2453 rc = allocatePage(pBt, &apNew[i], &pgnoNew[i], pgnoNew[i-1]);
2454 if( rc ) goto balance_cleanup;
2455 }
2456 nNew++;
2457 zeroPage(pBt, apNew[i]);
2458 apNew[i]->isInit = 1;
2459 }
2460
2461 /* Free any old pages that were not reused as new pages.
2462 */
2463 while( i<nOld ){
2464 rc = freePage(pBt, apOld[i], pgnoOld[i]);
2465 if( rc ) goto balance_cleanup;
2466 sqlitepager_unref(apOld[i]);
2467 apOld[i] = 0;
2468 i++;
2469 }
2470
2471 /*
2472 ** Put the new pages in accending order. This helps to
2473 ** keep entries in the disk file in order so that a scan
2474 ** of the table is a linear scan through the file. That
2475 ** in turn helps the operating system to deliver pages
2476 ** from the disk more rapidly.
2477 **
2478 ** An O(n^2) insertion sort algorithm is used, but since
2479 ** n is never more than NB (a small constant), that should
2480 ** not be a problem.
2481 **
2482 ** When NB==3, this one optimization makes the database
2483 ** about 25% faster for large insertions and deletions.
2484 */
2485 for(i=0; i<k-1; i++){
2486 int minV = pgnoNew[i];
2487 int minI = i;
2488 for(j=i+1; j<k; j++){
2489 if( pgnoNew[j]<(unsigned)minV ){
2490 minI = j;
2491 minV = pgnoNew[j];
2492 }
2493 }
2494 if( minI>i ){
2495 int t;
2496 MemPage *pT;
2497 t = pgnoNew[i];
2498 pT = apNew[i];
2499 pgnoNew[i] = pgnoNew[minI];
2500 apNew[i] = apNew[minI];
2501 pgnoNew[minI] = t;
2502 apNew[minI] = pT;
2503 }
2504 }
2505
2506 /*
2507 ** Evenly distribute the data in apCell[] across the new pages.
2508 ** Insert divider cells into pParent as necessary.
2509 */
2510 j = 0;
2511 for(i=0; i<nNew; i++){
2512 MemPage *pNew = apNew[i];
2513 while( j<cntNew[i] ){
2514 assert( pNew->nFree>=szCell[j] );
2515 if( pCur && iCur==j ){ pCur->pPage = pNew; pCur->idx = pNew->nCell; }
2516 insertCell(pBt, pNew, pNew->nCell, apCell[j], szCell[j]);
2517 j++;
2518 }
2519 assert( pNew->nCell>0 );
2520 assert( !pNew->isOverfull );
2521 relinkCellList(pBt, pNew);
2522 if( i<nNew-1 && j<nCell ){
2523 pNew->u.hdr.rightChild = apCell[j]->h.leftChild;
2524 apCell[j]->h.leftChild = SWAB32(pBt, pgnoNew[i]);
2525 if( pCur && iCur==j ){ pCur->pPage = pParent; pCur->idx = nxDiv; }
2526 insertCell(pBt, pParent, nxDiv, apCell[j], szCell[j]);
2527 j++;
2528 nxDiv++;
2529 }
2530 }
2531 assert( j==nCell );
2532 apNew[nNew-1]->u.hdr.rightChild = aOld[nOld-1].u.hdr.rightChild;
2533 if( nxDiv==pParent->nCell ){
2534 pParent->u.hdr.rightChild = SWAB32(pBt, pgnoNew[nNew-1]);
2535 }else{
2536 pParent->apCell[nxDiv]->h.leftChild = SWAB32(pBt, pgnoNew[nNew-1]);
2537 }
2538 if( pCur ){
2539 if( j<=iCur && pCur->pPage==pParent && pCur->idx>idxDiv[nOld-1] ){
2540 assert( pCur->pPage==pOldCurPage );
2541 pCur->idx += nNew - nOld;
2542 }else{
2543 assert( pOldCurPage!=0 );
2544 sqlitepager_ref(pCur->pPage);
2545 sqlitepager_unref(pOldCurPage);
2546 }
2547 }
2548
2549 /*
2550 ** Reparent children of all cells.
2551 */
2552 for(i=0; i<nNew; i++){
2553 reparentChildPages(pBt, apNew[i]);
2554 }
2555 reparentChildPages(pBt, pParent);
2556
2557 /*
2558 ** balance the parent page.
2559 */
2560 rc = balance(pBt, pParent, pCur);
2561
2562 /*
2563 ** Cleanup before returning.
2564 */
2565balance_cleanup:
2566 if( extraUnref ){
2567 sqlitepager_unref(extraUnref);
2568 }
2569 for(i=0; i<nOld; i++){
2570 if( apOld[i]!=0 && apOld[i]!=&aOld[i] ) sqlitepager_unref(apOld[i]);
2571 }
2572 for(i=0; i<nNew; i++){
2573 sqlitepager_unref(apNew[i]);
2574 }
2575 if( pCur && pCur->pPage==0 ){
2576 pCur->pPage = pParent;
2577 pCur->idx = 0;
2578 }else{
2579 sqlitepager_unref(pParent);
2580 }
2581 return rc;
2582}
2583
2584/*
2585** This routine checks all cursors that point to the same table
2586** as pCur points to. If any of those cursors were opened with
2587** wrFlag==0 then this routine returns SQLITE_LOCKED. If all
2588** cursors point to the same table were opened with wrFlag==1
2589** then this routine returns SQLITE_OK.
2590**
2591** In addition to checking for read-locks (where a read-lock
2592** means a cursor opened with wrFlag==0) this routine also moves
2593** all cursors other than pCur so that they are pointing to the
2594** first Cell on root page. This is necessary because an insert
2595** or delete might change the number of cells on a page or delete
2596** a page entirely and we do not want to leave any cursors
2597** pointing to non-existant pages or cells.
2598*/
2599static int checkReadLocks(BtCursor *pCur){
2600 BtCursor *p;
2601 assert( pCur->wrFlag );
2602 for(p=pCur->pShared; p!=pCur; p=p->pShared){
2603 assert( p );
2604 assert( p->pgnoRoot==pCur->pgnoRoot );
2605 if( p->wrFlag==0 ) return SQLITE_LOCKED;
2606 if( sqlitepager_pagenumber(p->pPage)!=p->pgnoRoot ){
2607 moveToRoot(p);
2608 }
2609 }
2610 return SQLITE_OK;
2611}
2612
2613/*
2614** Insert a new record into the BTree. The key is given by (pKey,nKey)
2615** and the data is given by (pData,nData). The cursor is used only to
2616** define what database the record should be inserted into. The cursor
2617** is left pointing at the new record.
2618*/
2619static int fileBtreeInsert(
2620 BtCursor *pCur, /* Insert data into the table of this cursor */
2621 const void *pKey, int nKey, /* The key of the new record */
2622 const void *pData, int nData /* The data of the new record */
2623){
2624 Cell newCell;
2625 int rc;
2626 int loc;
2627 int szNew;
2628 MemPage *pPage;
2629 Btree *pBt = pCur->pBt;
2630
2631 if( pCur->pPage==0 ){
2632 return SQLITE_ABORT; /* A rollback destroyed this cursor */
2633 }
2634 if( !pBt->inTrans || nKey+nData==0 ){
2635 /* Must start a transaction before doing an insert */
2636 return pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
2637 }
2638 assert( !pBt->readOnly );
2639 if( !pCur->wrFlag ){
2640 return SQLITE_PERM; /* Cursor not open for writing */
2641 }
2642 if( checkReadLocks(pCur) ){
2643 return SQLITE_LOCKED; /* The table pCur points to has a read lock */
2644 }
2645 rc = fileBtreeMoveto(pCur, pKey, nKey, &loc);
2646 if( rc ) return rc;
2647 pPage = pCur->pPage;
2648 assert( pPage->isInit );
2649 rc = sqlitepager_write(pPage);
2650 if( rc ) return rc;
2651 rc = fillInCell(pBt, &newCell, pKey, nKey, pData, nData);
2652 if( rc ) return rc;
2653 szNew = cellSize(pBt, &newCell);
2654 if( loc==0 ){
2655 newCell.h.leftChild = pPage->apCell[pCur->idx]->h.leftChild;
2656 rc = clearCell(pBt, pPage->apCell[pCur->idx]);
2657 if( rc ) return rc;
2658 dropCell(pBt, pPage, pCur->idx, cellSize(pBt, pPage->apCell[pCur->idx]));
2659 }else if( loc<0 && pPage->nCell>0 ){
2660 assert( pPage->u.hdr.rightChild==0 ); /* Must be a leaf page */
2661 pCur->idx++;
2662 }else{
2663 assert( pPage->u.hdr.rightChild==0 ); /* Must be a leaf page */
2664 }
2665 insertCell(pBt, pPage, pCur->idx, &newCell, szNew);
2666 rc = balance(pCur->pBt, pPage, pCur);
2667 /* sqliteBtreePageDump(pCur->pBt, pCur->pgnoRoot, 1); */
2668 /* fflush(stdout); */
2669 pCur->eSkip = SKIP_INVALID;
2670 return rc;
2671}
2672
2673/*
2674** Delete the entry that the cursor is pointing to.
2675**
2676** The cursor is left pointing at either the next or the previous
2677** entry. If the cursor is left pointing to the next entry, then
2678** the pCur->eSkip flag is set to SKIP_NEXT which forces the next call to
2679** sqliteBtreeNext() to be a no-op. That way, you can always call
2680** sqliteBtreeNext() after a delete and the cursor will be left
2681** pointing to the first entry after the deleted entry. Similarly,
2682** pCur->eSkip is set to SKIP_PREV is the cursor is left pointing to
2683** the entry prior to the deleted entry so that a subsequent call to
2684** sqliteBtreePrevious() will always leave the cursor pointing at the
2685** entry immediately before the one that was deleted.
2686*/
2687static int fileBtreeDelete(BtCursor *pCur){
2688 MemPage *pPage = pCur->pPage;
2689 Cell *pCell;
2690 int rc;
2691 Pgno pgnoChild;
2692 Btree *pBt = pCur->pBt;
2693
2694 assert( pPage->isInit );
2695 if( pCur->pPage==0 ){
2696 return SQLITE_ABORT; /* A rollback destroyed this cursor */
2697 }
2698 if( !pBt->inTrans ){
2699 /* Must start a transaction before doing a delete */
2700 return pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
2701 }
2702 assert( !pBt->readOnly );
2703 if( pCur->idx >= pPage->nCell ){
2704 return SQLITE_ERROR; /* The cursor is not pointing to anything */
2705 }
2706 if( !pCur->wrFlag ){
2707 return SQLITE_PERM; /* Did not open this cursor for writing */
2708 }
2709 if( checkReadLocks(pCur) ){
2710 return SQLITE_LOCKED; /* The table pCur points to has a read lock */
2711 }
2712 rc = sqlitepager_write(pPage);
2713 if( rc ) return rc;
2714 pCell = pPage->apCell[pCur->idx];
2715 pgnoChild = SWAB32(pBt, pCell->h.leftChild);
2716 clearCell(pBt, pCell);
2717 if( pgnoChild ){
2718 /*
2719 ** The entry we are about to delete is not a leaf so if we do not
2720 ** do something we will leave a hole on an internal page.
2721 ** We have to fill the hole by moving in a cell from a leaf. The
2722 ** next Cell after the one to be deleted is guaranteed to exist and
2723 ** to be a leaf so we can use it.
2724 */
2725 BtCursor leafCur;
2726 Cell *pNext;
2727 int szNext;
2728 int notUsed;
2729 getTempCursor(pCur, &leafCur);
2730 rc = fileBtreeNext(&leafCur, &notUsed);
2731 if( rc!=SQLITE_OK ){
2732 if( rc!=SQLITE_NOMEM ) rc = SQLITE_CORRUPT;
2733 return rc;
2734 }
2735 rc = sqlitepager_write(leafCur.pPage);
2736 if( rc ) return rc;
2737 dropCell(pBt, pPage, pCur->idx, cellSize(pBt, pCell));
2738 pNext = leafCur.pPage->apCell[leafCur.idx];
2739 szNext = cellSize(pBt, pNext);
2740 pNext->h.leftChild = SWAB32(pBt, pgnoChild);
2741 insertCell(pBt, pPage, pCur->idx, pNext, szNext);
2742 rc = balance(pBt, pPage, pCur);
2743 if( rc ) return rc;
2744 pCur->eSkip = SKIP_NEXT;
2745 dropCell(pBt, leafCur.pPage, leafCur.idx, szNext);
2746 rc = balance(pBt, leafCur.pPage, pCur);
2747 releaseTempCursor(&leafCur);
2748 }else{
2749 dropCell(pBt, pPage, pCur->idx, cellSize(pBt, pCell));
2750 if( pCur->idx>=pPage->nCell ){
2751 pCur->idx = pPage->nCell-1;
2752 if( pCur->idx<0 ){
2753 pCur->idx = 0;
2754 pCur->eSkip = SKIP_NEXT;
2755 }else{
2756 pCur->eSkip = SKIP_PREV;
2757 }
2758 }else{
2759 pCur->eSkip = SKIP_NEXT;
2760 }
2761 rc = balance(pBt, pPage, pCur);
2762 }
2763 return rc;
2764}
2765
2766/*
2767** Create a new BTree table. Write into *piTable the page
2768** number for the root page of the new table.
2769**
2770** In the current implementation, BTree tables and BTree indices are the
2771** the same. In the future, we may change this so that BTree tables
2772** are restricted to having a 4-byte integer key and arbitrary data and
2773** BTree indices are restricted to having an arbitrary key and no data.
2774** But for now, this routine also serves to create indices.
2775*/
2776static int fileBtreeCreateTable(Btree *pBt, int *piTable){
2777 MemPage *pRoot;
2778 Pgno pgnoRoot;
2779 int rc;
2780 if( !pBt->inTrans ){
2781 /* Must start a transaction first */
2782 return pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
2783 }
2784 if( pBt->readOnly ){
2785 return SQLITE_READONLY;
2786 }
2787 rc = allocatePage(pBt, &pRoot, &pgnoRoot, 0);
2788 if( rc ) return rc;
2789 assert( sqlitepager_iswriteable(pRoot) );
2790 zeroPage(pBt, pRoot);
2791 sqlitepager_unref(pRoot);
2792 *piTable = (int)pgnoRoot;
2793 return SQLITE_OK;
2794}
2795
2796/*
2797** Erase the given database page and all its children. Return
2798** the page to the freelist.
2799*/
2800static int clearDatabasePage(Btree *pBt, Pgno pgno, int freePageFlag){
2801 MemPage *pPage;
2802 int rc;
2803 Cell *pCell;
2804 int idx;
2805
2806 rc = sqlitepager_get(pBt->pPager, pgno, (void**)&pPage);
2807 if( rc ) return rc;
2808 rc = sqlitepager_write(pPage);
2809 if( rc ) return rc;
2810 rc = initPage(pBt, pPage, pgno, 0);
2811 if( rc ) return rc;
2812 idx = SWAB16(pBt, pPage->u.hdr.firstCell);
2813 while( idx>0 ){
2814 pCell = (Cell*)&pPage->u.aDisk[idx];
2815 idx = SWAB16(pBt, pCell->h.iNext);
2816 if( pCell->h.leftChild ){
2817 rc = clearDatabasePage(pBt, SWAB32(pBt, pCell->h.leftChild), 1);
2818 if( rc ) return rc;
2819 }
2820 rc = clearCell(pBt, pCell);
2821 if( rc ) return rc;
2822 }
2823 if( pPage->u.hdr.rightChild ){
2824 rc = clearDatabasePage(pBt, SWAB32(pBt, pPage->u.hdr.rightChild), 1);
2825 if( rc ) return rc;
2826 }
2827 if( freePageFlag ){
2828 rc = freePage(pBt, pPage, pgno);
2829 }else{
2830 zeroPage(pBt, pPage);
2831 }
2832 sqlitepager_unref(pPage);
2833 return rc;
2834}
2835
2836/*
2837** Delete all information from a single table in the database.
2838*/
2839static int fileBtreeClearTable(Btree *pBt, int iTable){
2840 int rc;
2841 BtCursor *pCur;
2842 if( !pBt->inTrans ){
2843 return pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
2844 }
2845 for(pCur=pBt->pCursor; pCur; pCur=pCur->pNext){
2846 if( pCur->pgnoRoot==(Pgno)iTable ){
2847 if( pCur->wrFlag==0 ) return SQLITE_LOCKED;
2848 moveToRoot(pCur);
2849 }
2850 }
2851 rc = clearDatabasePage(pBt, (Pgno)iTable, 0);
2852 if( rc ){
2853 fileBtreeRollback(pBt);
2854 }
2855 return rc;
2856}
2857
2858/*
2859** Erase all information in a table and add the root of the table to
2860** the freelist. Except, the root of the principle table (the one on
2861** page 2) is never added to the freelist.
2862*/
2863static int fileBtreeDropTable(Btree *pBt, int iTable){
2864 int rc;
2865 MemPage *pPage;
2866 BtCursor *pCur;
2867 if( !pBt->inTrans ){
2868 return pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
2869 }
2870 for(pCur=pBt->pCursor; pCur; pCur=pCur->pNext){
2871 if( pCur->pgnoRoot==(Pgno)iTable ){
2872 return SQLITE_LOCKED; /* Cannot drop a table that has a cursor */
2873 }
2874 }
2875 rc = sqlitepager_get(pBt->pPager, (Pgno)iTable, (void**)&pPage);
2876 if( rc ) return rc;
2877 rc = fileBtreeClearTable(pBt, iTable);
2878 if( rc ) return rc;
2879 if( iTable>2 ){
2880 rc = freePage(pBt, pPage, iTable);
2881 }else{
2882 zeroPage(pBt, pPage);
2883 }
2884 sqlitepager_unref(pPage);
2885 return rc;
2886}
2887
2888#if 0 /* UNTESTED */
2889/*
2890** Copy all cell data from one database file into another.
2891** pages back the freelist.
2892*/
2893static int copyCell(Btree *pBtFrom, BTree *pBtTo, Cell *pCell){
2894 Pager *pFromPager = pBtFrom->pPager;
2895 OverflowPage *pOvfl;
2896 Pgno ovfl, nextOvfl;
2897 Pgno *pPrev;
2898 int rc = SQLITE_OK;
2899 MemPage *pNew, *pPrevPg;
2900 Pgno new;
2901
2902 if( NKEY(pBtTo, pCell->h) + NDATA(pBtTo, pCell->h) <= MX_LOCAL_PAYLOAD ){
2903 return SQLITE_OK;
2904 }
2905 pPrev = &pCell->ovfl;
2906 pPrevPg = 0;
2907 ovfl = SWAB32(pBtTo, pCell->ovfl);
2908 while( ovfl && rc==SQLITE_OK ){
2909 rc = sqlitepager_get(pFromPager, ovfl, (void**)&pOvfl);
2910 if( rc ) return rc;
2911 nextOvfl = SWAB32(pBtFrom, pOvfl->iNext);
2912 rc = allocatePage(pBtTo, &pNew, &new, 0);
2913 if( rc==SQLITE_OK ){
2914 rc = sqlitepager_write(pNew);
2915 if( rc==SQLITE_OK ){
2916 memcpy(pNew, pOvfl, SQLITE_USABLE_SIZE);
2917 *pPrev = SWAB32(pBtTo, new);
2918 if( pPrevPg ){
2919 sqlitepager_unref(pPrevPg);
2920 }
2921 pPrev = &pOvfl->iNext;
2922 pPrevPg = pNew;
2923 }
2924 }
2925 sqlitepager_unref(pOvfl);
2926 ovfl = nextOvfl;
2927 }
2928 if( pPrevPg ){
2929 sqlitepager_unref(pPrevPg);
2930 }
2931 return rc;
2932}
2933#endif
2934
2935
2936#if 0 /* UNTESTED */
2937/*
2938** Copy a page of data from one database over to another.
2939*/
2940static int copyDatabasePage(
2941 Btree *pBtFrom,
2942 Pgno pgnoFrom,
2943 Btree *pBtTo,
2944 Pgno *pTo
2945){
2946 MemPage *pPageFrom, *pPage;
2947 Pgno to;
2948 int rc;
2949 Cell *pCell;
2950 int idx;
2951
2952 rc = sqlitepager_get(pBtFrom->pPager, pgno, (void**)&pPageFrom);
2953 if( rc ) return rc;
2954 rc = allocatePage(pBt, &pPage, pTo, 0);
2955 if( rc==SQLITE_OK ){
2956 rc = sqlitepager_write(pPage);
2957 }
2958 if( rc==SQLITE_OK ){
2959 memcpy(pPage, pPageFrom, SQLITE_USABLE_SIZE);
2960 idx = SWAB16(pBt, pPage->u.hdr.firstCell);
2961 while( idx>0 ){
2962 pCell = (Cell*)&pPage->u.aDisk[idx];
2963 idx = SWAB16(pBt, pCell->h.iNext);
2964 if( pCell->h.leftChild ){
2965 Pgno newChld;
2966 rc = copyDatabasePage(pBtFrom, SWAB32(pBtFrom, pCell->h.leftChild),
2967 pBtTo, &newChld);
2968 if( rc ) return rc;
2969 pCell->h.leftChild = SWAB32(pBtFrom, newChld);
2970 }
2971 rc = copyCell(pBtFrom, pBtTo, pCell);
2972 if( rc ) return rc;
2973 }
2974 if( pPage->u.hdr.rightChild ){
2975 Pgno newChld;
2976 rc = copyDatabasePage(pBtFrom, SWAB32(pBtFrom, pPage->u.hdr.rightChild),
2977 pBtTo, &newChld);
2978 if( rc ) return rc;
2979 pPage->u.hdr.rightChild = SWAB32(pBtTo, newChild);
2980 }
2981 }
2982 sqlitepager_unref(pPage);
2983 return rc;
2984}
2985#endif
2986
2987/*
2988** Read the meta-information out of a database file.
2989*/
2990static int fileBtreeGetMeta(Btree *pBt, int *aMeta){
2991 PageOne *pP1;
2992 int rc;
2993 int i;
2994
2995 rc = sqlitepager_get(pBt->pPager, 1, (void**)&pP1);
2996 if( rc ) return rc;
2997 aMeta[0] = SWAB32(pBt, pP1->nFree);
2998 for(i=0; i<sizeof(pP1->aMeta)/sizeof(pP1->aMeta[0]); i++){
2999 aMeta[i+1] = SWAB32(pBt, pP1->aMeta[i]);
3000 }
3001 sqlitepager_unref(pP1);
3002 return SQLITE_OK;
3003}
3004
3005/*
3006** Write meta-information back into the database.
3007*/
3008static int fileBtreeUpdateMeta(Btree *pBt, int *aMeta){
3009 PageOne *pP1;
3010 int rc, i;
3011 if( !pBt->inTrans ){
3012 return pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
3013 }
3014 pP1 = pBt->page1;
3015 rc = sqlitepager_write(pP1);
3016 if( rc ) return rc;
3017 for(i=0; i<sizeof(pP1->aMeta)/sizeof(pP1->aMeta[0]); i++){
3018 pP1->aMeta[i] = SWAB32(pBt, aMeta[i+1]);
3019 }
3020 return SQLITE_OK;
3021}
3022
3023/******************************************************************************
3024** The complete implementation of the BTree subsystem is above this line.
3025** All the code the follows is for testing and troubleshooting the BTree
3026** subsystem. None of the code that follows is used during normal operation.
3027******************************************************************************/
3028
3029/*
3030** Print a disassembly of the given page on standard output. This routine
3031** is used for debugging and testing only.
3032*/
3033#ifdef SQLITE_TEST
3034static int fileBtreePageDump(Btree *pBt, int pgno, int recursive){
3035 int rc;
3036 MemPage *pPage;
3037 int i, j;
3038 int nFree;
3039 u16 idx;
3040 char range[20];
3041 unsigned char payload[20];
3042 rc = sqlitepager_get(pBt->pPager, (Pgno)pgno, (void**)&pPage);
3043 if( rc ){
3044 return rc;
3045 }
3046 if( recursive ) printf("PAGE %d:\n", pgno);
3047 i = 0;
3048 idx = SWAB16(pBt, pPage->u.hdr.firstCell);
3049 while( idx>0 && idx<=SQLITE_USABLE_SIZE-MIN_CELL_SIZE ){
3050 Cell *pCell = (Cell*)&pPage->u.aDisk[idx];
3051 int sz = cellSize(pBt, pCell);
3052 sprintf(range,"%d..%d", idx, idx+sz-1);
3053 sz = NKEY(pBt, pCell->h) + NDATA(pBt, pCell->h);
3054 if( sz>sizeof(payload)-1 ) sz = sizeof(payload)-1;
3055 memcpy(payload, pCell->aPayload, sz);
3056 for(j=0; j<sz; j++){
3057 if( payload[j]<0x20 || payload[j]>0x7f ) payload[j] = '.';
3058 }
3059 payload[sz] = 0;
3060 printf(
3061 "cell %2d: i=%-10s chld=%-4d nk=%-4d nd=%-4d payload=%s\n",
3062 i, range, (int)pCell->h.leftChild,
3063 NKEY(pBt, pCell->h), NDATA(pBt, pCell->h),
3064 payload
3065 );
3066 if( pPage->isInit && pPage->apCell[i]!=pCell ){
3067 printf("**** apCell[%d] does not match on prior entry ****\n", i);
3068 }
3069 i++;
3070 idx = SWAB16(pBt, pCell->h.iNext);
3071 }
3072 if( idx!=0 ){
3073 printf("ERROR: next cell index out of range: %d\n", idx);
3074 }
3075 printf("right_child: %d\n", SWAB32(pBt, pPage->u.hdr.rightChild));
3076 nFree = 0;
3077 i = 0;
3078 idx = SWAB16(pBt, pPage->u.hdr.firstFree);
3079 while( idx>0 && idx<SQLITE_USABLE_SIZE ){
3080 FreeBlk *p = (FreeBlk*)&pPage->u.aDisk[idx];
3081 sprintf(range,"%d..%d", idx, idx+p->iSize-1);
3082 nFree += SWAB16(pBt, p->iSize);
3083 printf("freeblock %2d: i=%-10s size=%-4d total=%d\n",
3084 i, range, SWAB16(pBt, p->iSize), nFree);
3085 idx = SWAB16(pBt, p->iNext);
3086 i++;
3087 }
3088 if( idx!=0 ){
3089 printf("ERROR: next freeblock index out of range: %d\n", idx);
3090 }
3091 if( recursive && pPage->u.hdr.rightChild!=0 ){
3092 idx = SWAB16(pBt, pPage->u.hdr.firstCell);
3093 while( idx>0 && idx<SQLITE_USABLE_SIZE-MIN_CELL_SIZE ){
3094 Cell *pCell = (Cell*)&pPage->u.aDisk[idx];
3095 fileBtreePageDump(pBt, SWAB32(pBt, pCell->h.leftChild), 1);
3096 idx = SWAB16(pBt, pCell->h.iNext);
3097 }
3098 fileBtreePageDump(pBt, SWAB32(pBt, pPage->u.hdr.rightChild), 1);
3099 }
3100 sqlitepager_unref(pPage);
3101 return SQLITE_OK;
3102}
3103#endif
3104
3105#ifdef SQLITE_TEST
3106/*
3107** Fill aResult[] with information about the entry and page that the
3108** cursor is pointing to.
3109**
3110** aResult[0] = The page number
3111** aResult[1] = The entry number
3112** aResult[2] = Total number of entries on this page
3113** aResult[3] = Size of this entry
3114** aResult[4] = Number of free bytes on this page
3115** aResult[5] = Number of free blocks on the page
3116** aResult[6] = Page number of the left child of this entry
3117** aResult[7] = Page number of the right child for the whole page
3118**
3119** This routine is used for testing and debugging only.
3120*/
3121static int fileBtreeCursorDump(BtCursor *pCur, int *aResult){
3122 int cnt, idx;
3123 MemPage *pPage = pCur->pPage;
3124 Btree *pBt = pCur->pBt;
3125 aResult[0] = sqlitepager_pagenumber(pPage);
3126 aResult[1] = pCur->idx;
3127 aResult[2] = pPage->nCell;
3128 if( pCur->idx>=0 && pCur->idx<pPage->nCell ){
3129 aResult[3] = cellSize(pBt, pPage->apCell[pCur->idx]);
3130 aResult[6] = SWAB32(pBt, pPage->apCell[pCur->idx]->h.leftChild);
3131 }else{
3132 aResult[3] = 0;
3133 aResult[6] = 0;
3134 }
3135 aResult[4] = pPage->nFree;
3136 cnt = 0;
3137 idx = SWAB16(pBt, pPage->u.hdr.firstFree);
3138 while( idx>0 && idx<SQLITE_USABLE_SIZE ){
3139 cnt++;
3140 idx = SWAB16(pBt, ((FreeBlk*)&pPage->u.aDisk[idx])->iNext);
3141 }
3142 aResult[5] = cnt;
3143 aResult[7] = SWAB32(pBt, pPage->u.hdr.rightChild);
3144 return SQLITE_OK;
3145}
3146#endif
3147
3148/*
3149** Return the pager associated with a BTree. This routine is used for
3150** testing and debugging only.
3151*/
3152static Pager *fileBtreePager(Btree *pBt){
3153 return pBt->pPager;
3154}
3155
3156/*
3157** This structure is passed around through all the sanity checking routines
3158** in order to keep track of some global state information.
3159*/
3160typedef struct IntegrityCk IntegrityCk;
3161struct IntegrityCk {
3162 Btree *pBt; /* The tree being checked out */
3163 Pager *pPager; /* The associated pager. Also accessible by pBt->pPager */
3164 int nPage; /* Number of pages in the database */
3165 int *anRef; /* Number of times each page is referenced */
3166 char *zErrMsg; /* An error message. NULL of no errors seen. */
3167};
3168
3169/*
3170** Append a message to the error message string.
3171*/
3172static void checkAppendMsg(IntegrityCk *pCheck, char *zMsg1, char *zMsg2){
3173 if( pCheck->zErrMsg ){
3174 char *zOld = pCheck->zErrMsg;
3175 pCheck->zErrMsg = 0;
3176 sqliteSetString(&pCheck->zErrMsg, zOld, "\n", zMsg1, zMsg2, (char*)0);
3177 sqliteFree(zOld);
3178 }else{
3179 sqliteSetString(&pCheck->zErrMsg, zMsg1, zMsg2, (char*)0);
3180 }
3181}
3182
3183/*
3184** Add 1 to the reference count for page iPage. If this is the second
3185** reference to the page, add an error message to pCheck->zErrMsg.
3186** Return 1 if there are 2 ore more references to the page and 0 if
3187** if this is the first reference to the page.
3188**
3189** Also check that the page number is in bounds.
3190*/
3191static int checkRef(IntegrityCk *pCheck, int iPage, char *zContext){
3192 if( iPage==0 ) return 1;
3193 if( iPage>pCheck->nPage || iPage<0 ){
3194 char zBuf[100];
3195 sprintf(zBuf, "invalid page number %d", iPage);
3196 checkAppendMsg(pCheck, zContext, zBuf);
3197 return 1;
3198 }
3199 if( pCheck->anRef[iPage]==1 ){
3200 char zBuf[100];
3201 sprintf(zBuf, "2nd reference to page %d", iPage);
3202 checkAppendMsg(pCheck, zContext, zBuf);
3203 return 1;
3204 }
3205 return (pCheck->anRef[iPage]++)>1;
3206}
3207
3208/*
3209** Check the integrity of the freelist or of an overflow page list.
3210** Verify that the number of pages on the list is N.
3211*/
3212static void checkList(
3213 IntegrityCk *pCheck, /* Integrity checking context */
3214 int isFreeList, /* True for a freelist. False for overflow page list */
3215 int iPage, /* Page number for first page in the list */
3216 int N, /* Expected number of pages in the list */
3217 char *zContext /* Context for error messages */
3218){
3219 int i;
3220 char zMsg[100];
3221 while( N-- > 0 ){
3222 OverflowPage *pOvfl;
3223 if( iPage<1 ){
3224 sprintf(zMsg, "%d pages missing from overflow list", N+1);
3225 checkAppendMsg(pCheck, zContext, zMsg);
3226 break;
3227 }
3228 if( checkRef(pCheck, iPage, zContext) ) break;
3229 if( sqlitepager_get(pCheck->pPager, (Pgno)iPage, (void**)&pOvfl) ){
3230 sprintf(zMsg, "failed to get page %d", iPage);
3231 checkAppendMsg(pCheck, zContext, zMsg);
3232 break;
3233 }
3234 if( isFreeList ){
3235 FreelistInfo *pInfo = (FreelistInfo*)pOvfl->aPayload;
3236 int n = SWAB32(pCheck->pBt, pInfo->nFree);
3237 for(i=0; i<n; i++){
3238 checkRef(pCheck, SWAB32(pCheck->pBt, pInfo->aFree[i]), zContext);
3239 }
3240 N -= n;
3241 }
3242 iPage = SWAB32(pCheck->pBt, pOvfl->iNext);
3243 sqlitepager_unref(pOvfl);
3244 }
3245}
3246
3247/*
3248** Return negative if zKey1<zKey2.
3249** Return zero if zKey1==zKey2.
3250** Return positive if zKey1>zKey2.
3251*/
3252static int keyCompare(
3253 const char *zKey1, int nKey1,
3254 const char *zKey2, int nKey2
3255){
3256 int min = nKey1>nKey2 ? nKey2 : nKey1;
3257 int c = memcmp(zKey1, zKey2, min);
3258 if( c==0 ){
3259 c = nKey1 - nKey2;
3260 }
3261 return c;
3262}
3263
3264/*
3265** Do various sanity checks on a single page of a tree. Return
3266** the tree depth. Root pages return 0. Parents of root pages
3267** return 1, and so forth.
3268**
3269** These checks are done:
3270**
3271** 1. Make sure that cells and freeblocks do not overlap
3272** but combine to completely cover the page.
3273** 2. Make sure cell keys are in order.
3274** 3. Make sure no key is less than or equal to zLowerBound.
3275** 4. Make sure no key is greater than or equal to zUpperBound.
3276** 5. Check the integrity of overflow pages.
3277** 6. Recursively call checkTreePage on all children.
3278** 7. Verify that the depth of all children is the same.
3279** 8. Make sure this page is at least 33% full or else it is
3280** the root of the tree.
3281*/
3282static int checkTreePage(
3283 IntegrityCk *pCheck, /* Context for the sanity check */
3284 int iPage, /* Page number of the page to check */
3285 MemPage *pParent, /* Parent page */
3286 char *zParentContext, /* Parent context */
3287 char *zLowerBound, /* All keys should be greater than this, if not NULL */
3288 int nLower, /* Number of characters in zLowerBound */
3289 char *zUpperBound, /* All keys should be less than this, if not NULL */
3290 int nUpper /* Number of characters in zUpperBound */
3291){
3292 MemPage *pPage;
3293 int i, rc, depth, d2, pgno;
3294 char *zKey1, *zKey2;
3295 int nKey1, nKey2;
3296 BtCursor cur;
3297 Btree *pBt;
3298 char zMsg[100];
3299 char zContext[100];
3300 char hit[SQLITE_USABLE_SIZE];
3301
3302 /* Check that the page exists
3303 */
3304 cur.pBt = pBt = pCheck->pBt;
3305 if( iPage==0 ) return 0;
3306 if( checkRef(pCheck, iPage, zParentContext) ) return 0;
3307 sprintf(zContext, "On tree page %d: ", iPage);
3308 if( (rc = sqlitepager_get(pCheck->pPager, (Pgno)iPage, (void**)&pPage))!=0 ){
3309 sprintf(zMsg, "unable to get the page. error code=%d", rc);
3310 checkAppendMsg(pCheck, zContext, zMsg);
3311 return 0;
3312 }
3313 if( (rc = initPage(pBt, pPage, (Pgno)iPage, pParent))!=0 ){
3314 sprintf(zMsg, "initPage() returns error code %d", rc);
3315 checkAppendMsg(pCheck, zContext, zMsg);
3316 sqlitepager_unref(pPage);
3317 return 0;
3318 }
3319
3320 /* Check out all the cells.
3321 */
3322 depth = 0;
3323 if( zLowerBound ){
3324 zKey1 = sqliteMalloc( nLower+1 );
3325 memcpy(zKey1, zLowerBound, nLower);
3326 zKey1[nLower] = 0;
3327 }else{
3328 zKey1 = 0;
3329 }
3330 nKey1 = nLower;
3331 cur.pPage = pPage;
3332 for(i=0; i<pPage->nCell; i++){
3333 Cell *pCell = pPage->apCell[i];
3334 int sz;
3335
3336 /* Check payload overflow pages
3337 */
3338 nKey2 = NKEY(pBt, pCell->h);
3339 sz = nKey2 + NDATA(pBt, pCell->h);
3340 sprintf(zContext, "On page %d cell %d: ", iPage, i);
3341 if( sz>MX_LOCAL_PAYLOAD ){
3342 int nPage = (sz - MX_LOCAL_PAYLOAD + OVERFLOW_SIZE - 1)/OVERFLOW_SIZE;
3343 checkList(pCheck, 0, SWAB32(pBt, pCell->ovfl), nPage, zContext);
3344 }
3345
3346 /* Check that keys are in the right order
3347 */
3348 cur.idx = i;
3349 zKey2 = sqliteMallocRaw( nKey2+1 );
3350 getPayload(&cur, 0, nKey2, zKey2);
3351 if( zKey1 && keyCompare(zKey1, nKey1, zKey2, nKey2)>=0 ){
3352 checkAppendMsg(pCheck, zContext, "Key is out of order");
3353 }
3354
3355 /* Check sanity of left child page.
3356 */
3357 pgno = SWAB32(pBt, pCell->h.leftChild);
3358 d2 = checkTreePage(pCheck, pgno, pPage, zContext, zKey1,nKey1,zKey2,nKey2);
3359 if( i>0 && d2!=depth ){
3360 checkAppendMsg(pCheck, zContext, "Child page depth differs");
3361 }
3362 depth = d2;
3363 sqliteFree(zKey1);
3364 zKey1 = zKey2;
3365 nKey1 = nKey2;
3366 }
3367 pgno = SWAB32(pBt, pPage->u.hdr.rightChild);
3368 sprintf(zContext, "On page %d at right child: ", iPage);
3369 checkTreePage(pCheck, pgno, pPage, zContext, zKey1,nKey1,zUpperBound,nUpper);
3370 sqliteFree(zKey1);
3371
3372 /* Check for complete coverage of the page
3373 */
3374 memset(hit, 0, sizeof(hit));
3375 memset(hit, 1, sizeof(PageHdr));
3376 for(i=SWAB16(pBt, pPage->u.hdr.firstCell); i>0 && i<SQLITE_USABLE_SIZE; ){
3377 Cell *pCell = (Cell*)&pPage->u.aDisk[i];
3378 int j;
3379 for(j=i+cellSize(pBt, pCell)-1; j>=i; j--) hit[j]++;
3380 i = SWAB16(pBt, pCell->h.iNext);
3381 }
3382 for(i=SWAB16(pBt,pPage->u.hdr.firstFree); i>0 && i<SQLITE_USABLE_SIZE; ){
3383 FreeBlk *pFBlk = (FreeBlk*)&pPage->u.aDisk[i];
3384 int j;
3385 for(j=i+SWAB16(pBt,pFBlk->iSize)-1; j>=i; j--) hit[j]++;
3386 i = SWAB16(pBt,pFBlk->iNext);
3387 }
3388 for(i=0; i<SQLITE_USABLE_SIZE; i++){
3389 if( hit[i]==0 ){
3390 sprintf(zMsg, "Unused space at byte %d of page %d", i, iPage);
3391 checkAppendMsg(pCheck, zMsg, 0);
3392 break;
3393 }else if( hit[i]>1 ){
3394 sprintf(zMsg, "Multiple uses for byte %d of page %d", i, iPage);
3395 checkAppendMsg(pCheck, zMsg, 0);
3396 break;
3397 }
3398 }
3399
3400 /* Check that free space is kept to a minimum
3401 */
3402#if 0
3403 if( pParent && pParent->nCell>2 && pPage->nFree>3*SQLITE_USABLE_SIZE/4 ){
3404 sprintf(zMsg, "free space (%d) greater than max (%d)", pPage->nFree,
3405 SQLITE_USABLE_SIZE/3);
3406 checkAppendMsg(pCheck, zContext, zMsg);
3407 }
3408#endif
3409
3410 sqlitepager_unref(pPage);
3411 return depth;
3412}
3413
3414/*
3415** This routine does a complete check of the given BTree file. aRoot[] is
3416** an array of pages numbers were each page number is the root page of
3417** a table. nRoot is the number of entries in aRoot.
3418**
3419** If everything checks out, this routine returns NULL. If something is
3420** amiss, an error message is written into memory obtained from malloc()
3421** and a pointer to that error message is returned. The calling function
3422** is responsible for freeing the error message when it is done.
3423*/
3424char *fileBtreeIntegrityCheck(Btree *pBt, int *aRoot, int nRoot){
3425 int i;
3426 int nRef;
3427 IntegrityCk sCheck;
3428
3429 nRef = *sqlitepager_stats(pBt->pPager);
3430 if( lockBtree(pBt)!=SQLITE_OK ){
3431 return sqliteStrDup("Unable to acquire a read lock on the database");
3432 }
3433 sCheck.pBt = pBt;
3434 sCheck.pPager = pBt->pPager;
3435 sCheck.nPage = sqlitepager_pagecount(sCheck.pPager);
3436 if( sCheck.nPage==0 ){
3437 unlockBtreeIfUnused(pBt);
3438 return 0;
3439 }
3440 sCheck.anRef = sqliteMallocRaw( (sCheck.nPage+1)*sizeof(sCheck.anRef[0]) );
3441 sCheck.anRef[1] = 1;
3442 for(i=2; i<=sCheck.nPage; i++){ sCheck.anRef[i] = 0; }
3443 sCheck.zErrMsg = 0;
3444
3445 /* Check the integrity of the freelist
3446 */
3447 checkList(&sCheck, 1, SWAB32(pBt, pBt->page1->freeList),
3448 SWAB32(pBt, pBt->page1->nFree), "Main freelist: ");
3449
3450 /* Check all the tables.
3451 */
3452 for(i=0; i<nRoot; i++){
3453 if( aRoot[i]==0 ) continue;
3454 checkTreePage(&sCheck, aRoot[i], 0, "List of tree roots: ", 0,0,0,0);
3455 }
3456
3457 /* Make sure every page in the file is referenced
3458 */
3459 for(i=1; i<=sCheck.nPage; i++){
3460 if( sCheck.anRef[i]==0 ){
3461 char zBuf[100];
3462 sprintf(zBuf, "Page %d is never used", i);
3463 checkAppendMsg(&sCheck, zBuf, 0);
3464 }
3465 }
3466
3467 /* Make sure this analysis did not leave any unref() pages
3468 */
3469 unlockBtreeIfUnused(pBt);
3470 if( nRef != *sqlitepager_stats(pBt->pPager) ){
3471 char zBuf[100];
3472 sprintf(zBuf,
3473 "Outstanding page count goes from %d to %d during this analysis",
3474 nRef, *sqlitepager_stats(pBt->pPager)
3475 );
3476 checkAppendMsg(&sCheck, zBuf, 0);
3477 }
3478
3479 /* Clean up and report errors.
3480 */
3481 sqliteFree(sCheck.anRef);
3482 return sCheck.zErrMsg;
3483}
3484
3485/*
3486** Return the full pathname of the underlying database file.
3487*/
3488static const char *fileBtreeGetFilename(Btree *pBt){
3489 assert( pBt->pPager!=0 );
3490 return sqlitepager_filename(pBt->pPager);
3491}
3492
3493/*
3494** Copy the complete content of pBtFrom into pBtTo. A transaction
3495** must be active for both files.
3496**
3497** The size of file pBtFrom may be reduced by this operation.
3498** If anything goes wrong, the transaction on pBtFrom is rolled back.
3499*/
3500static int fileBtreeCopyFile(Btree *pBtTo, Btree *pBtFrom){
3501 int rc = SQLITE_OK;
3502 Pgno i, nPage, nToPage;
3503
3504 if( !pBtTo->inTrans || !pBtFrom->inTrans ) return SQLITE_ERROR;
3505 if( pBtTo->needSwab!=pBtFrom->needSwab ) return SQLITE_ERROR;
3506 if( pBtTo->pCursor ) return SQLITE_BUSY;
3507 memcpy(pBtTo->page1, pBtFrom->page1, SQLITE_USABLE_SIZE);
3508 rc = sqlitepager_overwrite(pBtTo->pPager, 1, pBtFrom->page1);
3509 nToPage = sqlitepager_pagecount(pBtTo->pPager);
3510 nPage = sqlitepager_pagecount(pBtFrom->pPager);
3511 for(i=2; rc==SQLITE_OK && i<=nPage; i++){
3512 void *pPage;
3513 rc = sqlitepager_get(pBtFrom->pPager, i, &pPage);
3514 if( rc ) break;
3515 rc = sqlitepager_overwrite(pBtTo->pPager, i, pPage);
3516 if( rc ) break;
3517 sqlitepager_unref(pPage);
3518 }
3519 for(i=nPage+1; rc==SQLITE_OK && i<=nToPage; i++){
3520 void *pPage;
3521 rc = sqlitepager_get(pBtTo->pPager, i, &pPage);
3522 if( rc ) break;
3523 rc = sqlitepager_write(pPage);
3524 sqlitepager_unref(pPage);
3525 sqlitepager_dont_write(pBtTo->pPager, i);
3526 }
3527 if( !rc && nPage<nToPage ){
3528 rc = sqlitepager_truncate(pBtTo->pPager, nPage);
3529 }
3530 if( rc ){
3531 fileBtreeRollback(pBtTo);
3532 }
3533 return rc;
3534}
3535
3536/*
3537** The following tables contain pointers to all of the interface
3538** routines for this implementation of the B*Tree backend. To
3539** substitute a different implemention of the backend, one has merely
3540** to provide pointers to alternative functions in similar tables.
3541*/
3542static BtOps sqliteBtreeOps = {
3543 fileBtreeClose,
3544 fileBtreeSetCacheSize,
3545 fileBtreeSetSafetyLevel,
3546 fileBtreeBeginTrans,
3547 fileBtreeCommit,
3548 fileBtreeRollback,
3549 fileBtreeBeginCkpt,
3550 fileBtreeCommitCkpt,
3551 fileBtreeRollbackCkpt,
3552 fileBtreeCreateTable,
3553 fileBtreeCreateTable, /* Really sqliteBtreeCreateIndex() */
3554 fileBtreeDropTable,
3555 fileBtreeClearTable,
3556 fileBtreeCursor,
3557 fileBtreeGetMeta,
3558 fileBtreeUpdateMeta,
3559 fileBtreeIntegrityCheck,
3560 fileBtreeGetFilename,
3561 fileBtreeCopyFile,
3562 fileBtreePager,
3563#ifdef SQLITE_TEST
3564 fileBtreePageDump,
3565#endif
3566};
3567static BtCursorOps sqliteBtreeCursorOps = {
3568 fileBtreeMoveto,
3569 fileBtreeDelete,
3570 fileBtreeInsert,
3571 fileBtreeFirst,
3572 fileBtreeLast,
3573 fileBtreeNext,
3574 fileBtreePrevious,
3575 fileBtreeKeySize,
3576 fileBtreeKey,
3577 fileBtreeKeyCompare,
3578 fileBtreeDataSize,
3579 fileBtreeData,
3580 fileBtreeCloseCursor,
3581#ifdef SQLITE_TEST
3582 fileBtreeCursorDump,
3583#endif
3584};
Note: See TracBrowser for help on using the repository browser.