001    /**
002     * Licensed to the Apache Software Foundation (ASF) under one or more
003     * contributor license agreements.  See the NOTICE file distributed with
004     * this work for additional information regarding copyright ownership.
005     * The ASF licenses this file to You under the Apache License, Version 2.0
006     * (the "License"); you may not use this file except in compliance with
007     * the License.  You may obtain a copy of the License at
008     *
009     *      http://www.apache.org/licenses/LICENSE-2.0
010     *
011     * Unless required by applicable law or agreed to in writing, software
012     * distributed under the License is distributed on an "AS IS" BASIS,
013     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014     * See the License for the specific language governing permissions and
015     * limitations under the License.
016     */
017    package org.apache.kahadb.page;
018    
019    import java.io.ByteArrayInputStream;
020    import java.io.ByteArrayOutputStream;
021    import java.io.DataInputStream;
022    import java.io.DataOutputStream;
023    import java.io.File;
024    import java.io.FileInputStream;
025    import java.io.FileOutputStream;
026    import java.io.IOException;
027    import java.io.InterruptedIOException;
028    import java.io.RandomAccessFile;
029    import java.util.ArrayList;
030    import java.util.Arrays;
031    import java.util.Collection;
032    import java.util.Iterator;
033    import java.util.LinkedHashMap;
034    import java.util.Map;
035    import java.util.Properties;
036    import java.util.TreeMap;
037    import java.util.Map.Entry;
038    import java.util.concurrent.CountDownLatch;
039    import java.util.concurrent.atomic.AtomicBoolean;
040    import java.util.concurrent.atomic.AtomicLong;
041    import java.util.zip.Adler32;
042    import java.util.zip.Checksum;
043    
044    import org.apache.commons.logging.Log;
045    import org.apache.commons.logging.LogFactory;
046    import org.apache.kahadb.util.DataByteArrayOutputStream;
047    import org.apache.kahadb.util.IOHelper;
048    import org.apache.kahadb.util.IntrospectionSupport;
049    import org.apache.kahadb.util.LRUCache;
050    import org.apache.kahadb.util.Sequence;
051    import org.apache.kahadb.util.SequenceSet;
052    
053    /**
054     * A PageFile provides you random access to fixed sized disk pages. This object is not thread safe and therefore access to it should 
055     * be externally synchronized.
056     * 
057     * The file has 3 parts:
058     * Metadata Space: 4k : Reserved metadata area. Used to store persistent config about the file.
059     * Recovery Buffer Space: Page Size * 1000 : This is a redo log used to prevent partial page writes from making the file inconsistent
060     * Page Space: The pages in the page file.
061     * 
062     * @version $Revision: 814649 $
063     */
064    public class PageFile {
065        
    private static final String PAGEFILE_SUFFIX = ".data";
    private static final String RECOVERY_FILE_SUFFIX = ".redo";
    private static final String FREE_FILE_SUFFIX = ".free";
    
    // 4k Default page size, overridable via the "defaultPageSize" system property.
    public static final int DEFAULT_PAGE_SIZE = Integer.parseInt(System.getProperty("defaultPageSize", ""+1024*4)); 
    public static final int DEFAULT_WRITE_BATCH_SIZE = Integer.parseInt(System.getProperty("defaultWriteBatchSize", ""+1000));
    private static final int RECOVERY_FILE_HEADER_SIZE=1024*4;
    private static final int PAGE_FILE_HEADER_SIZE=1024*4;

    // Recovery header is (long offset)
    private static final Log LOG = LogFactory.getLog(PageFile.class);

    // A PageFile will use a couple of files in this directory
    private File directory;
    // And the file names in that directory will be based on this name.
    private final String name;
    
    // File handle used for reading pages..
    private RandomAccessFile readFile;
    // File handle used for writing pages..
    private RandomAccessFile writeFile;
    // File handle used for writing to the recovery (redo) file..
    private RandomAccessFile recoveryFile;

    // The size of pages
    private int pageSize = DEFAULT_PAGE_SIZE;
    
    // The minimum amount of space allocated to the recovery file, in number of pages.
    private int recoveryFileMinPageCount = 1000;
    // The max size that we let the recovery file grow to.. may exceed the max, but the file will get resized 
    // to this max size as soon as possible.
    private int recoveryFileMaxPageCount = 10000;
    // The number of pages in the current recovery buffer
    private int recoveryPageCount;

    // True once load() has run, until unload() is called.
    private AtomicBoolean loaded = new AtomicBoolean();
    // The number of pages we are aiming to write every time we 
    // write to disk.
    int writeBatchSize = DEFAULT_WRITE_BATCH_SIZE;

    // Cache of recently used pages, keyed by page id (only populated when enablePageCaching is true).
    private LRUCache<Long, Page> pageCache;
    // Is the read page cache enabled?
    private boolean enablePageCaching=true;
    // How many pages will we keep in the cache?
    private int pageCacheSize = 100;
    
    // Should first log the page write to the recovery buffer? Avoids partial
    // page write failures..
    private boolean enableRecoveryFile=true;
    // Will we sync writes to disk. Ensures that data will not be lost after a checkpoint()
    private boolean enableDiskSyncs=true;
    // Will writes be done in an async thread?
    private boolean enabledWriteThread=false;

    // These are used if enabledWriteThread==true 
    private AtomicBoolean stopWriter = new AtomicBoolean();
    private Thread writerThread;
    private CountDownLatch checkpointLatch;

    // Keeps track of writes that are being written to disk.
    private TreeMap<Long, PageWrite> writes=new TreeMap<Long, PageWrite>();

    // The id that will be assigned to the next newly allocated page.
    private final AtomicLong nextFreePageId = new AtomicLong();
    // Keeps track of free pages.
    private SequenceSet freeList = new SequenceSet();
    
    // The next write transaction id.
    private AtomicLong nextTxid = new AtomicLong();
    
    // Persistent settings stored in the page file. 
    private MetaData metaData;
138        
139        /**
140         * Use to keep track of updated pages which have not yet been committed.
141         */
142        static class PageWrite {
143            Page page;
144            byte[] current;
145            byte[] diskBound;
146    
147            public PageWrite(Page page, byte[] data) {
148                this.page=page;
149                current=data;
150            }
151                    
152            public void setCurrent(Page page, byte[] data) {
153                this.page=page;
154                current=data;
155            }
156    
157            @Override
158            public String toString() {
159                return "[PageWrite:"+page.getPageId()+"]";
160            }
161    
162            @SuppressWarnings("unchecked")
163            public Page getPage() {
164                return page;
165            }
166            
167            void begin() {
168                diskBound = current;
169                current = null;
170            }
171            
172            /**
173             * @return true if there is no pending writes to do.
174             */
175            boolean done() {
176                diskBound=null;
177                return current == null;
178            }
179    
180        }
181        
182        /**
183         * The MetaData object hold the persistent data associated with a PageFile object. 
184         */
185        public static class MetaData {
186            
187            String fileType;
188            String fileTypeVersion;
189            
190            long metaDataTxId=-1;
191            int pageSize;
192            boolean cleanShutdown;
193            long lastTxId;
194            long freePages;
195            
196            public String getFileType() {
197                return fileType;
198            }
199            public void setFileType(String fileType) {
200                this.fileType = fileType;
201            }
202            public String getFileTypeVersion() {
203                return fileTypeVersion;
204            }
205            public void setFileTypeVersion(String version) {
206                this.fileTypeVersion = version;
207            }
208            public long getMetaDataTxId() {
209                return metaDataTxId;
210            }
211            public void setMetaDataTxId(long metaDataTxId) {
212                this.metaDataTxId = metaDataTxId;
213            }
214            public int getPageSize() {
215                return pageSize;
216            }
217            public void setPageSize(int pageSize) {
218                this.pageSize = pageSize;
219            }
220            public boolean isCleanShutdown() {
221                return cleanShutdown;
222            }
223            public void setCleanShutdown(boolean cleanShutdown) {
224                this.cleanShutdown = cleanShutdown;
225            }
226            public long getLastTxId() {
227                return lastTxId;
228            }
229            public void setLastTxId(long lastTxId) {
230                this.lastTxId = lastTxId;
231            }
232            public long getFreePages() {
233                return freePages;
234            }
235            public void setFreePages(long value) {
236                this.freePages = value;
237            }
238        }
239    
    /**
     * Starts a new transaction against this page file.
     *
     * @return a new Transaction bound to this PageFile
     * @throws IllegalStateException if the page file is not loaded
     */
    public Transaction tx() {
        assertLoaded();
        return new Transaction(this);
    }
244        
245        /**
246         * Creates a PageFile in the specified directory who's data files are named by name.
247         * 
248         * @param directory
249         * @param name
250         */
251        public PageFile(File directory, String name) {
252            this.directory = directory;
253            this.name = name;
254        }
255        
256        /**
257         * Deletes the files used by the PageFile object.  This method can only be used when this object is not loaded.
258         * 
259         * @throws IOException 
260         *         if the files cannot be deleted.
261         * @throws IllegalStateException 
262         *         if this PageFile is loaded
263         */
264        public void delete() throws IOException {
265            if( loaded.get() ) {
266                throw new IllegalStateException("Cannot delete page file data when the page file is loaded");
267            }
268            delete(getMainPageFile());
269            delete(getFreeFile());
270            delete(getRecoveryFile());
271        }
272    
273        /**
274         * @param file
275         * @throws IOException
276         */
277        private void delete(File file) throws IOException {
278            if( file.exists() ) {
279                if( !file.delete() ) {
280                    throw new IOException("Could not delete: "+file.getPath());
281                }
282            }
283        }
284        
285        /**
286         * Loads the page file so that it can be accessed for read/write purposes.  This allocates OS resources.  If this is the 
287         * first time the page file is loaded, then this creates the page file in the file system.
288         * 
289         * @throws IOException
290         *         If the page file cannot be loaded. This could be cause the existing page file is corrupt is a bad version or if 
291         *         there was a disk error.
292         * @throws IllegalStateException 
293         *         If the page file was already loaded.
294         */
295        public void load() throws IOException, IllegalStateException {
296            if (loaded.compareAndSet(false, true)) {
297                
298                if( enablePageCaching ) {
299                    pageCache = new LRUCache<Long, Page>(pageCacheSize, pageCacheSize, 0.75f, true);
300                }
301                
302                File file = getMainPageFile();
303                IOHelper.mkdirs(file.getParentFile());
304                writeFile = new RandomAccessFile(file, "rw");
305                readFile = new RandomAccessFile(file, "r");
306                
307                if (readFile.length() > 0) {
308                    // Load the page size setting cause that can't change once the file is created.
309                    loadMetaData();
310                    pageSize = metaData.getPageSize();
311                } else {
312                    // Store the page size setting cause that can't change once the file is created.
313                    metaData = new MetaData();
314                    metaData.setFileType(PageFile.class.getName());
315                    metaData.setFileTypeVersion("1");
316                    metaData.setPageSize(getPageSize());
317                    metaData.setCleanShutdown(true);
318                    metaData.setFreePages(-1);
319                    metaData.setLastTxId(0);
320                    storeMetaData();
321                }
322    
323                if( enableRecoveryFile ) {
324                    recoveryFile = new RandomAccessFile(getRecoveryFile(), "rw");
325                }
326                
327                if(  metaData.isCleanShutdown() ) {
328                    nextTxid.set(metaData.getLastTxId()+1);
329                    if( metaData.getFreePages()>0 ) {
330                        loadFreeList();
331                    } 
332                } else {
333                    LOG.debug("Recovering page file...");
334                    nextTxid.set(redoRecoveryUpdates());
335                    
336                    // Scan all to find the free pages.
337                    freeList = new SequenceSet();
338                    for (Iterator i = tx().iterator(true); i.hasNext();) {
339                        Page page = (Page)i.next();
340                        if( page.getType() == Page.PAGE_FREE_TYPE ) {
341                            freeList.add(page.getPageId());
342                        }
343                    }
344                    
345                }
346                
347                metaData.setCleanShutdown(false);
348                storeMetaData();
349                getFreeFile().delete();
350                
351                if( writeFile.length() < PAGE_FILE_HEADER_SIZE) {
352                    writeFile.setLength(PAGE_FILE_HEADER_SIZE);
353                }
354                nextFreePageId.set((writeFile.length()-PAGE_FILE_HEADER_SIZE)/pageSize);
355                startWriter();
356                    
357            } else {
358                throw new IllegalStateException("Cannot load the page file when it is allready loaded.");
359            }
360        }
361    
362    
363        /**
364         * Unloads a previously loaded PageFile.  This deallocates OS related resources like file handles.
365         * once unloaded, you can no longer use the page file to read or write Pages.
366         * 
367         * @throws IOException
368         *         if there was a disk error occurred while closing the down the page file.
369         * @throws IllegalStateException
370         *         if the PageFile is not loaded
371         */
372        public void unload() throws IOException {
373            if (loaded.compareAndSet(true, false)) {
374                flush();
375                try {
376                    stopWriter();
377                } catch (InterruptedException e) {
378                    throw new InterruptedIOException();
379                }
380                
381                if( freeList.isEmpty() ) {
382                    metaData.setFreePages(0);
383                } else {
384                    storeFreeList();
385                    metaData.setFreePages(freeList.size());
386                }
387                
388                metaData.setLastTxId( nextTxid.get()-1 );
389                metaData.setCleanShutdown(true);
390                storeMetaData();
391                
392                if (readFile != null) {
393                    readFile.close();
394                    readFile = null;
395                    writeFile.close();
396                    writeFile=null;
397                    if( enableRecoveryFile ) {
398                        recoveryFile.close();
399                        recoveryFile=null;
400                    }
401                    freeList.clear();
402                    if( pageCache!=null ) {
403                        pageCache=null;
404                    }
405                    synchronized(writes) {
406                        writes.clear();
407                    }
408                }
409            } else {
410                throw new IllegalStateException("Cannot unload the page file when it is not loaded");
411            }
412        }
413            
    /**
     * @return true if load() has been called and unload() has not yet been called.
     */
    public boolean isLoaded() {
        return loaded.get();
    }
417    
418        /**
419         * Flush and sync all write buffers to disk.
420         * 
421         * @throws IOException
422         *         If an disk error occurred.
423         */
424        public void flush() throws IOException {
425    
426            if( enabledWriteThread && stopWriter.get() ) {
427                throw new IOException("Page file already stopped: checkpointing is not allowed");
428            }
429            
430            // Setup a latch that gets notified when all buffered writes hits the disk.
431            CountDownLatch checkpointLatch;
432            synchronized( writes ) {
433                if( writes.isEmpty()) {                
434                    return;
435                }
436                if( enabledWriteThread ) {
437                    if( this.checkpointLatch == null ) {
438                        this.checkpointLatch = new CountDownLatch(1);
439                    }
440                    checkpointLatch = this.checkpointLatch;
441                    writes.notify();
442                } else {
443                    writeBatch();
444                    return;
445                }
446            }
447            try {
448                int size = writes.size();
449                long start = System.currentTimeMillis();
450                checkpointLatch.await();        
451                long end = System.currentTimeMillis();
452                if( end-start > 100 ) {
453                    LOG.warn("KahaDB PageFile flush: " + size + " queued writes, latch wait took "+(end-start));
454                }
455            } catch (InterruptedException e) {
456                throw new InterruptedIOException();
457            }
458        }
459    
460        
461        public String toString() {
462            return "Page File: "+getMainPageFile();
463        }
464        
465        ///////////////////////////////////////////////////////////////////
466        // Private Implementation Methods
467        ///////////////////////////////////////////////////////////////////
468        private File getMainPageFile() {
469            return new File(directory, IOHelper.toFileSystemSafeName(name)+PAGEFILE_SUFFIX);
470        }
471        
472        public File getFreeFile() {
473            return new File(directory, IOHelper.toFileSystemSafeName(name)+FREE_FILE_SUFFIX);
474        } 
475    
476        public File getRecoveryFile() {
477            return new File(directory, IOHelper.toFileSystemSafeName(name)+RECOVERY_FILE_SUFFIX);
478        } 
479    
    /**
     * Converts a logical page id into its absolute byte offset in the main
     * page file (pages start after the fixed-size metadata header).
     */
    private long toOffset(long pageId) {
        return PAGE_FILE_HEADER_SIZE+(pageId*pageSize);
    }
483    
484        private void loadMetaData() throws IOException {
485    
486            ByteArrayInputStream is;
487            MetaData v1 = new MetaData();
488            MetaData v2 = new MetaData();
489            try {
490                Properties p = new Properties();
491                byte[] d = new byte[PAGE_FILE_HEADER_SIZE/2];
492                readFile.seek(0);
493                readFile.readFully(d);
494                is = new ByteArrayInputStream(d);
495                p.load(is);
496                IntrospectionSupport.setProperties(v1, p);
497            } catch (IOException e) {
498                v1 = null;
499            }
500            
501            try {
502                Properties p = new Properties();
503                byte[] d = new byte[PAGE_FILE_HEADER_SIZE/2];
504                readFile.seek(PAGE_FILE_HEADER_SIZE/2);
505                readFile.readFully(d);
506                is = new ByteArrayInputStream(d);
507                p.load(is);
508                IntrospectionSupport.setProperties(v2, p);
509            } catch (IOException e) {
510                v2 = null;
511            }
512            
513            if( v1==null && v2==null ) {
514                throw new IOException("Could not load page file meta data");
515            } 
516            
517            if( v1 == null || v1.metaDataTxId<0 ) {
518                metaData = v2;
519            } else if( v2==null || v1.metaDataTxId<0 ) {
520                metaData = v1;
521            } else if( v1.metaDataTxId==v2.metaDataTxId ) {
522                metaData = v1; // use the first since the 2nd could be a partial..
523            } else {
524                metaData = v2; // use the second cause the first is probably a partial.
525            }
526        }
527        
528        private void storeMetaData() throws IOException {
529            // Convert the metadata into a property format
530            metaData.metaDataTxId++;
531            Properties p = new Properties();
532            IntrospectionSupport.getProperties(metaData, p, null);
533            
534            ByteArrayOutputStream os = new ByteArrayOutputStream(PAGE_FILE_HEADER_SIZE);
535            p.store(os, "");
536            if( os.size() > PAGE_FILE_HEADER_SIZE/2) { 
537                throw new IOException("Configuation is to larger than: "+PAGE_FILE_HEADER_SIZE/2);
538            }
539            // Fill the rest with space...
540            byte[] filler = new byte[(PAGE_FILE_HEADER_SIZE/2)-os.size()];
541            Arrays.fill(filler, (byte)' ');
542            os.write(filler);
543            os.flush();
544            
545            byte[] d = os.toByteArray();
546    
547            // So we don't loose it.. write it 2 times...
548            writeFile.seek(0);
549            writeFile.write(d);
550            writeFile.getFD().sync();
551            writeFile.seek(PAGE_FILE_HEADER_SIZE/2);
552            writeFile.write(d);
553            writeFile.getFD().sync();
554        }
555    
556        private void storeFreeList() throws IOException {
557            FileOutputStream os = new FileOutputStream(getFreeFile());
558            DataOutputStream dos = new DataOutputStream(os);
559            SequenceSet.Marshaller.INSTANCE.writePayload(freeList, dos);
560            dos.close();
561        }
562    
563        private void loadFreeList() throws IOException {
564            freeList.clear();
565            FileInputStream is = new FileInputStream(getFreeFile());
566            DataInputStream dis = new DataInputStream(is);
567            freeList = SequenceSet.Marshaller.INSTANCE.readPayload(dis);
568            dis.close();
569        }
570        
571        ///////////////////////////////////////////////////////////////////
572        // Property Accessors 
573        ///////////////////////////////////////////////////////////////////
574        
575        /**
576         * Is the recovery buffer used to double buffer page writes.  Enabled by default.
577         * 
578         * @return is the recovery buffer enabled.
579         */
580        public boolean isEnableRecoveryFile() {
581            return enableRecoveryFile;
582        }
583    
584        /**
585         * Sets if the recovery buffer uses to double buffer page writes.  Enabled by default.  Disabling this
586         * may potentially cause partial page writes which can lead to page file corruption.
587         */
588        public void setEnableRecoveryFile(boolean doubleBuffer) {
589            assertNotLoaded();
590            this.enableRecoveryFile = doubleBuffer;
591        }
592    
593        /**
594         * @return Are page writes synced to disk?
595         */
596        public boolean isEnableDiskSyncs() {
597            return enableDiskSyncs;
598        }
599    
600        /**
601         * Allows you enable syncing writes to disk.
602         * @param syncWrites
603         */
604        public void setEnableDiskSyncs(boolean syncWrites) {
605            assertNotLoaded();
606            this.enableDiskSyncs = syncWrites;
607        }
608        
609        /**
610         * @return the page size
611         */
612        public int getPageSize() {
613            return this.pageSize;
614        }
615    
616        /**
617         * @return the amount of content data that a page can hold.
618         */
619        public int getPageContentSize() {
620            return this.pageSize-Page.PAGE_HEADER_SIZE;
621        }
622        
623        /**
624         * Configures the page size used by the page file.  By default it is 4k.  Once a page file is created on disk,
625         * subsequent loads of that file will use the original pageSize.  Once the PageFile is loaded, this setting
626         * can no longer be changed.
627         * 
628         * @param pageSize the pageSize to set
629         * @throws IllegalStateException
630         *         once the page file is loaded.
631         */
632        public void setPageSize(int pageSize) throws IllegalStateException {
633            assertNotLoaded();
634            this.pageSize = pageSize;
635        }
636        
637        /**
638         * @return true if read page caching is enabled
639         */
640        public boolean isEnablePageCaching() {
641            return this.enablePageCaching;
642        }
643    
644        /**
645         * @param allows you to enable read page caching
646         */
647        public void setEnablePageCaching(boolean enablePageCaching) {
648            assertNotLoaded();
649            this.enablePageCaching = enablePageCaching;
650        }
651    
652        /**
653         * @return the maximum number of pages that will get stored in the read page cache.
654         */
655        public int getPageCacheSize() {
656            return this.pageCacheSize;
657        }
658    
659        /**
660         * @param Sets the maximum number of pages that will get stored in the read page cache.
661         */
662        public void setPageCacheSize(int pageCacheSize) {
663            assertNotLoaded();
664            this.pageCacheSize = pageCacheSize;
665        }
666    
    /**
     * @return true if page writes are handed off to a background writer thread.
     */
    public boolean isEnabledWriteThread() {
        return enabledWriteThread;
    }

    /**
     * Enables or disables the async writer thread.
     * @param enableAsyncWrites true to queue writes for a background thread
     * @throws IllegalStateException if the page file is already loaded
     */
    public void setEnableWriteThread(boolean enableAsyncWrites) {
        assertNotLoaded();
        this.enabledWriteThread = enableAsyncWrites;
    }

    /**
     * @return the byte offset just past the last allocated page, i.e. the size of the page space.
     */
    public long getDiskSize() throws IOException {
        return toOffset(nextFreePageId.get());
    }
    
    /**
     * @return the number of pages allocated in the PageFile
     */
    public long getPageCount() {
        return nextFreePageId.get();
    }

    /**
     * @return the minimum size, in pages, reserved for the recovery (redo) file.
     */
    public int getRecoveryFileMinPageCount() {
        return recoveryFileMinPageCount;
    }

    /**
     * Sets the minimum recovery file size in pages.
     * @throws IllegalStateException if the page file is already loaded
     */
    public void setRecoveryFileMinPageCount(int recoveryFileMinPageCount) {
        assertNotLoaded();
        this.recoveryFileMinPageCount = recoveryFileMinPageCount;
    }

    /**
     * @return the maximum size, in pages, that the recovery file is allowed to stay at.
     */
    public int getRecoveryFileMaxPageCount() {
        return recoveryFileMaxPageCount;
    }

    /**
     * Sets the maximum recovery file size in pages.
     * @throws IllegalStateException if the page file is already loaded
     */
    public void setRecoveryFileMaxPageCount(int recoveryFileMaxPageCount) {
        assertNotLoaded();
        this.recoveryFileMaxPageCount = recoveryFileMaxPageCount;
    }
704    
705            public int getWriteBatchSize() {
706                    return writeBatchSize;
707            }
708    
709            public void setWriteBatchSize(int writeBatchSize) {
710            assertNotLoaded();
711                    this.writeBatchSize = writeBatchSize;
712            }
713    
714            ///////////////////////////////////////////////////////////////////
715        // Package Protected Methods exposed to Transaction
716        ///////////////////////////////////////////////////////////////////
717    
718        /**
719         * @throws IllegalStateException if the page file is not loaded.
720         */
721        void assertLoaded() throws IllegalStateException {
722            if( !loaded.get() ) {
723                throw new IllegalStateException("PageFile is not loaded");
724            }
725        }
726        void assertNotLoaded() throws IllegalStateException {
727            if( loaded.get() ) {
728                throw new IllegalStateException("PageFile is loaded");
729            }
730        }
731            
732        /** 
733         * Allocates a block of free pages that you can write data to.
734         * 
735         * @param count the number of sequential pages to allocate
736         * @return the first page of the sequential set. 
737         * @throws IOException
738         *         If an disk error occurred.
739         * @throws IllegalStateException
740         *         if the PageFile is not loaded
741         */
742        <T> Page<T> allocate(int count) throws IOException {
743            assertLoaded();
744            if (count <= 0) {
745                throw new IllegalArgumentException("The allocation count must be larger than zero");
746            }
747    
748            Sequence seq = freeList.removeFirstSequence(count);
749    
750            // We may need to create new free pages...
751            if (seq == null) {
752    
753                Page<T> first = null;
754                int c = count;
755                while (c > 0) {
756                    Page<T> page = new Page<T>(nextFreePageId.getAndIncrement());
757                    page.makeFree(getNextWriteTransactionId());
758    
759                    if (first == null) {
760                        first = page;
761                    }
762    
763                    addToCache(page);
764                    DataByteArrayOutputStream out = new DataByteArrayOutputStream(pageSize);
765                    page.write(out);
766                    write(page, out.getData());
767    
768                    // LOG.debug("allocate writing: "+page.getPageId());
769                    c--;
770                }
771    
772                return first;
773            }
774    
775            Page<T> page = new Page<T>(seq.getFirst());
776            page.makeFree(0);
777            // LOG.debug("allocated: "+page.getPageId());
778            return page;
779        }
780    
    /**
     * @return the next (monotonically increasing) transaction id to tag a page write with.
     */
    long getNextWriteTransactionId() {
        return nextTxid.incrementAndGet();
    }
784    
    /**
     * Reads the raw bytes of the given page from disk into the supplied buffer,
     * using the read-only file handle.  data.length bytes are read starting at
     * the page's offset.
     */
    void readPage(long pageId, byte[] data) throws IOException {
        readFile.seek(toOffset(pageId));
        readFile.readFully(data);
    }
789    
790        public void freePage(long pageId) {
791            freeList.add(pageId);
792            if( enablePageCaching ) {
793                pageCache.remove(pageId);
794            }
795        }
796        
797        @SuppressWarnings("unchecked")
798        private <T> void write(Page<T> page, byte[] data) throws IOException {
799            final PageWrite write = new PageWrite(page, data);
800            Entry<Long, PageWrite> entry = new Entry<Long, PageWrite>(){
801                public Long getKey() {
802                    return write.getPage().getPageId();
803                }
804                public PageWrite getValue() {
805                    return write;
806                }
807                public PageWrite setValue(PageWrite value) {
808                    return null;
809                }
810            };
811            Entry<Long, PageWrite>[] entries = new Map.Entry[]{entry};
812            write(Arrays.asList(entries));
813        }
814    
    /**
     * Merges a batch of page updates into the pending-write map, blocking when
     * the asynchronous writer is enabled and the map is at capacity. Either
     * notifies the writer thread or flushes synchronously once the batch
     * threshold is reached.
     *
     * @param updates page-id to PageWrite entries to queue.
     * @throws IOException if a synchronous writeBatch() fails.
     * @throws InterruptedIOException if interrupted while waiting for capacity.
     */
    void write(Collection<Map.Entry<Long, PageWrite>> updates) throws IOException {
        synchronized( writes ) {
            if( enabledWriteThread  ) {
                // Back-pressure: wait until the writer thread drains the map below
                // the batch size (it notifyAll()s from pollWrites).
                while( writes.size() >= writeBatchSize && !stopWriter.get() ) {
                    try {
                        writes.wait();
                    } catch (InterruptedException e) {
                        // Re-assert the interrupt and surface it as an IO failure.
                        Thread.currentThread().interrupt();
                        throw new InterruptedIOException();
                    }
                }
            }

            for (Map.Entry<Long, PageWrite> entry : updates) {
                Long key = entry.getKey();
                PageWrite value = entry.getValue();
                PageWrite write = writes.get(key);
                if( write==null ) {
                    writes.put(key, value);
                } else {
                    // A write for this page is already queued: replace its current
                    // (not-yet-disk-bound) contents instead of queuing a duplicate.
                    write.setCurrent(value.page, value.current);
                }
            }
            
            // Once we start approaching capacity, notify the writer to start writing
            if( canStartWriteBatch() ) {
                if( enabledWriteThread  ) {
                    writes.notify();
                } else {
                    writeBatch();
                }
            }
        }            
    }
849        
850        private boolean canStartWriteBatch() {
851                    int capacityUsed = ((writes.size() * 100)/writeBatchSize);
852            if( enabledWriteThread ) {
853                // The constant 10 here controls how soon write batches start going to disk..
854                // would be nice to figure out how to auto tune that value.  Make to small and
855                // we reduce through put because we are locking the write mutex too often doing writes
856                return capacityUsed >= 10 || checkpointLatch!=null;
857            } else {
858                return capacityUsed >= 80 || checkpointLatch!=null;
859            }
860        }
861    
862        ///////////////////////////////////////////////////////////////////
863        // Cache Related operations
864        ///////////////////////////////////////////////////////////////////
865        @SuppressWarnings("unchecked")
866        <T> Page<T> getFromCache(long pageId) {
867            synchronized(writes) {
868                PageWrite pageWrite = writes.get(pageId);
869                if( pageWrite != null ) {
870                    return pageWrite.page;
871                }
872            }
873    
874            Page<T> result = null;
875            if (enablePageCaching) {
876                result = pageCache.get(pageId);
877            }
878            return result;
879        }
880    
881        void addToCache(Page page) {
882            if (enablePageCaching) {
883                pageCache.put(page.getPageId(), page);
884            }
885        }
886    
887        void removeFromCache(Page page) {
888            if (enablePageCaching) {
889                pageCache.remove(page.getPageId());
890            }
891        }
892    
893        ///////////////////////////////////////////////////////////////////
894        // Internal Double write implementation follows...
895        ///////////////////////////////////////////////////////////////////
896        /**
897         * 
898         */
899        private void pollWrites() {
900            try {
901                while( !stopWriter.get() ) {
902                    // Wait for a notification...
903                    synchronized( writes ) {  
904                        writes.notifyAll();
905                        
906                        // If there is not enough to write, wait for a notification...
907                        while( writes.isEmpty() && checkpointLatch==null && !stopWriter.get() ) {
908                            writes.wait(100);
909                        }
910                        
911                        if( writes.isEmpty() ) {
912                            releaseCheckpointWaiter();
913                        }
914                    }
915                    writeBatch();
916                }
917            } catch (Throwable e) {
918                e.printStackTrace();
919            } finally {
920                releaseCheckpointWaiter();
921            }
922        }
923    
924        /**
925         * 
926         * @param timeout
927         * @param unit
928         * @return true if there are still pending writes to do.
929         * @throws InterruptedException 
930         * @throws IOException 
931         */
932        private void writeBatch() throws IOException {
933                
934            CountDownLatch checkpointLatch;
935            ArrayList<PageWrite> batch;
936            synchronized( writes ) {
937                // If there is not enough to write, wait for a notification...
938    
939                batch = new ArrayList<PageWrite>(writes.size());
940                // build a write batch from the current write cache. 
941                for (PageWrite write : writes.values()) {
942                    batch.add(write);
943                    // Move the current write to the diskBound write, this lets folks update the 
944                    // page again without blocking for this write.
945                    write.begin();
946                }
947    
948                // Grab on to the existing checkpoint latch cause once we do this write we can 
949                // release the folks that were waiting for those writes to hit disk.
950                checkpointLatch = this.checkpointLatch;
951                this.checkpointLatch=null;
952            }
953            
954     
955           if (enableRecoveryFile) {
956               
957               // Using Adler-32 instead of CRC-32 because it's much faster and it's 
958               // weakness for short messages with few hundred bytes is not a factor in this case since we know 
959               // our write batches are going to much larger.
960               Checksum checksum = new Adler32();
961               for (PageWrite w : batch) {
962                   checksum.update(w.diskBound, 0, pageSize);
963               }
964               
965               // Can we shrink the recovery buffer??
966               if( recoveryPageCount > recoveryFileMaxPageCount ) {
967                   int t = Math.max(recoveryFileMinPageCount, batch.size());
968                   recoveryFile.setLength(recoveryFileSizeForPages(t));
969               }
970               
971                // Record the page writes in the recovery buffer.
972                recoveryFile.seek(0);
973                // Store the next tx id...
974                recoveryFile.writeLong(nextTxid.get());
975                // Store the checksum for thw write batch so that on recovery we know if we have a consistent 
976                // write batch on disk.
977                recoveryFile.writeLong(checksum.getValue());
978                // Write the # of pages that will follow
979                recoveryFile.writeInt(batch.size());
980                
981                
982                // Write the pages.
983                recoveryFile.seek(RECOVERY_FILE_HEADER_SIZE);
984                for (PageWrite w : batch) {
985                    recoveryFile.writeLong(w.page.getPageId());
986                    recoveryFile.write(w.diskBound, 0, pageSize);
987                }
988                
989                if (enableDiskSyncs) {
990                    // Sync to make sure recovery buffer writes land on disk..
991                    recoveryFile.getFD().sync();
992                }
993                
994                recoveryPageCount = batch.size();
995            }
996           
997            
998            for (PageWrite w : batch) {
999                writeFile.seek(toOffset(w.page.getPageId()));
1000                writeFile.write(w.diskBound, 0, pageSize);
1001            }
1002            
1003            // Sync again
1004            if( enableDiskSyncs ) {
1005                writeFile.getFD().sync();
1006            }
1007            
1008            synchronized( writes ) {
1009                for (PageWrite w : batch) {
1010                    // If there are no more pending writes, then remove it from the write cache.
1011                    if( w.done() ) {
1012                        writes.remove(w.page.getPageId());
1013                    }
1014                }
1015            }
1016            
1017            if( checkpointLatch!=null ) {
1018                checkpointLatch.countDown();
1019            }
1020        }
1021    
1022        private long recoveryFileSizeForPages(int pageCount) {
1023            return RECOVERY_FILE_HEADER_SIZE+((pageSize+8)*pageCount);
1024        }
1025    
1026        private void releaseCheckpointWaiter() {
1027            if( checkpointLatch!=null ) {
1028                checkpointLatch.countDown();
1029                checkpointLatch=null;
1030            }
1031        }       
1032        
1033        /**
1034         * Inspects the recovery buffer and re-applies any 
1035         * partially applied page writes.
1036         * 
1037         * @return the next transaction id that can be used.
1038         * @throws IOException
1039         */
1040        private long redoRecoveryUpdates() throws IOException {
1041            if( !enableRecoveryFile ) {
1042                return 0;
1043            }
1044            recoveryPageCount=0;
1045            
1046            // Are we initializing the recovery file?
1047            if( recoveryFile.length() == 0 ) {
1048                // Write an empty header..
1049                recoveryFile.write(new byte[RECOVERY_FILE_HEADER_SIZE]);
1050                // Preallocate the minium size for better performance.
1051                recoveryFile.setLength(recoveryFileSizeForPages(recoveryFileMinPageCount));
1052                return 0;
1053            }
1054            
1055            // How many recovery pages do we have in the recovery buffer?
1056            recoveryFile.seek(0);
1057            long nextTxId = readFile.readLong();
1058            long expectedChecksum = readFile.readLong();
1059            int pageCounter = readFile.readInt();
1060            
1061            recoveryFile.seek(RECOVERY_FILE_HEADER_SIZE);
1062            Checksum checksum = new Adler32();
1063            LinkedHashMap<Long, byte[]> batch = new LinkedHashMap<Long, byte[]>();
1064            try {
1065                for (int i = 0; i < pageCounter; i++) {
1066                    long offset = recoveryFile.readLong();
1067                    byte []data = new byte[pageSize];
1068                    if( recoveryFile.read(data, 0, pageSize) != pageSize ) {
1069                        // Invalid recovery record, Could not fully read the data". Probably due to a partial write to the recovery buffer
1070                        return nextTxId;
1071                    }
1072                    checksum.update(data, 0, pageSize);
1073                    batch.put(offset, data);
1074                }
1075            } catch (Exception e) {
1076                // If an error occurred it was cause the redo buffer was not full written out correctly.. so don't redo it. 
1077                // as the pages should still be consistent.
1078                LOG.debug("Redo buffer was not fully intact: ", e);
1079                return nextTxId;
1080            }
1081            
1082            recoveryPageCount = pageCounter;
1083            
1084            // If the checksum is not valid then the recovery buffer was partially written to disk.
1085            if( checksum.getValue() != expectedChecksum ) {
1086                return nextTxId;
1087            }
1088            
1089            // Re-apply all the writes in the recovery buffer.
1090            for (Map.Entry<Long, byte[]> e : batch.entrySet()) {
1091                writeFile.seek(e.getKey());
1092                e.getValue();
1093                writeFile.write(e.getValue());
1094            }
1095            
1096            // And sync it to disk
1097            writeFile.getFD().sync();
1098            return nextTxId;
1099        }
1100    
1101        private void startWriter() {
1102            synchronized( writes ) {
1103                if( enabledWriteThread ) {
1104                    stopWriter.set(false);
1105                    writerThread = new Thread("KahaDB Page Writer") {
1106                        @Override
1107                        public void run() {
1108                            pollWrites();
1109                        }
1110                    };
1111                    writerThread.setPriority(Thread.MAX_PRIORITY);
1112                    writerThread.setDaemon(true);
1113                    writerThread.start();
1114                }
1115            }
1116        }
1117     
1118        private void stopWriter() throws InterruptedException {
1119            if( enabledWriteThread ) {
1120                stopWriter.set(true);
1121                writerThread.join();
1122            }
1123        }
1124    
1125            public File getFile() {
1126                    return getMainPageFile();
1127            }
1128    
1129    }